From eae1162d0c821686f96a3793f4f8ade85847103b Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Wed, 3 Nov 2021 13:58:26 -0400 Subject: [PATCH] Port audio manipulation tutorial --- docs/source/index.rst | 65 ++- .../audio_data_augmentation_tutorial.py | 414 ++++++++++++++++ examples/tutorials/audio_datasets_tutorial.py | 93 ++++ .../audio_feature_augmentation_tutorial.py | 156 ++++++ .../audio_feature_extractions_tutorial.py | 464 ++++++++++++++++++ examples/tutorials/audio_io_tutorial.py | 438 +++++++++++++++++ .../tutorials/audio_resampling_tutorial.py | 441 +++++++++++++++++ 7 files changed, 2049 insertions(+), 22 deletions(-) create mode 100644 examples/tutorials/audio_data_augmentation_tutorial.py create mode 100644 examples/tutorials/audio_datasets_tutorial.py create mode 100644 examples/tutorials/audio_feature_augmentation_tutorial.py create mode 100644 examples/tutorials/audio_feature_extractions_tutorial.py create mode 100644 examples/tutorials/audio_io_tutorial.py create mode 100644 examples/tutorials/audio_resampling_tutorial.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 59a2c7f094..337e01bcb2 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -25,8 +25,11 @@ Features described in this documentation are classified by release status: The :mod:`torchaudio` package consists of I/O, popular datasets and common audio transformations. +Package References +------------------ + .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Package Reference torchaudio @@ -42,29 +45,33 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio utils prototype +Getting Started +--------------- + .. toctree:: - :maxdepth: 2 - :caption: Tutorials + :maxdepth: 1 + :caption: Getting Started - tutorials/speech_recognition_pipeline_tutorial - tutorials/forced_alignment_tutorial - tutorials/tacotron2_pipeline_tutorial + tutorials/audio_io_tutorial + tutorials/audio_resampling_tutorial + tutorials/audio_data_augmentation_tutorial + tutorials/audio_feature_extractions_tutorial + tutorials/audio_feature_augmentation_tutorial + tutorials/audio_datasets_tutorial + +Advanced Usages +--------------- .. toctree:: :maxdepth: 1 - :caption: PyTorch Libraries - - PyTorch - torchaudio - torchtext - torchvision - TorchElastic - TorchServe - PyTorch on XLA Devices + :caption: Advanced Usages + tutorials/speech_recognition_pipeline_tutorial + tutorials/forced_alignment_tutorial + tutorials/tacotron2_pipeline_tutorial Citing torchaudio -~~~~~~~~~~~~~~~~~ +----------------- If you find torchaudio useful, please cite the following paper: @@ -81,13 +88,27 @@ In BibTeX format: @article{yang2021torchaudio, title={TorchAudio: Building Blocks for Audio and Speech Processing}, - author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and Anjali Chourdia and Artyom Astafurov - and Caroline Chen and Ching-Feng Yeh and Christian Puhrsch and David Pollack and - Dmitriy Genzel and Donny Greenberg and Edward Z. Yang and Jason Lian and Jay - Mahadeokar and Jeff Hwang and Ji Chen and Peter Goldsborough and Prabhat Roy and - Sean Narenthiran and Shinji Watanabe and Soumith Chintala and Vincent - Quenneville-Bélair and Yangyang Shi}, + author={Yao-Yuan Yang and Moto Hira and Zhaoheng Ni and + Anjali Chourdia and Artyom Astafurov and Caroline Chen and + Ching-Feng Yeh and Christian Puhrsch and David Pollack and + Dmitriy Genzel and Donny Greenberg and Edward Z. 
Yang and + Jason Lian and Jay Mahadeokar and Jeff Hwang and Ji Chen and + Peter Goldsborough and Prabhat Roy and Sean Narenthiran and + Shinji Watanabe and Soumith Chintala and + Vincent Quenneville-Bélair and Yangyang Shi}, journal={arXiv preprint arXiv:2110.15018}, year={2021} } +.. toctree:: + :maxdepth: 1 + :caption: PyTorch Libraries + :hidden: + + PyTorch + torchaudio + torchtext + torchvision + TorchElastic + TorchServe + PyTorch on XLA Devices diff --git a/examples/tutorials/audio_data_augmentation_tutorial.py b/examples/tutorials/audio_data_augmentation_tutorial.py new file mode 100644 index 0000000000..e5575d47cf --- /dev/null +++ b/examples/tutorials/audio_data_augmentation_tutorial.py @@ -0,0 +1,414 @@ +# -*- coding: utf-8 -*- +""" +Audio Data Augmentation +======================= + +``torchaudio`` provides a variety of ways to augment audio data. +""" + +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio + +import torch +import torchaudio +import torchaudio.functional as F + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commos BY 4.0. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. 
+#------------------------------------------------------------------------------- + +import math +import os +import requests + +import matplotlib.pyplot as plt +from IPython.display import Audio, display + + +_SAMPLE_DIR = "_assets" + +SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav" +SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav") + +SAMPLE_RIR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav" +SAMPLE_RIR_PATH = os.path.join(_SAMPLE_DIR, "rir.wav") + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +SAMPLE_NOISE_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/distant-16k/distractors/rm1/babb/Lab41-SRI-VOiCES-rm1-babb-mc01-stu-clo.wav" +SAMPLE_NOISE_PATH = os.path.join(_SAMPLE_DIR, "bg.wav") + +os.makedirs(_SAMPLE_DIR, exist_ok=True) + +def _fetch_data(): + uri = [ + (SAMPLE_WAV_URL, SAMPLE_WAV_PATH), + (SAMPLE_RIR_URL, SAMPLE_RIR_PATH), + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + (SAMPLE_NOISE_URL, SAMPLE_NOISE_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_PATH, resample=resample) + +def get_speech_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + 
waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def get_rir_sample(*, resample=None, processed=False): + rir_raw, sample_rate = _get_sample(SAMPLE_RIR_PATH, resample=resample) + if not processed: + return rir_raw, sample_rate + rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)] + rir = rir / torch.norm(rir, p=2) + rir = torch.flip(rir, [1]) + return rir, sample_rate + +def get_noise_sample(*, resample=None): + return _get_sample(SAMPLE_NOISE_PATH, resample=resample) + + +###################################################################### +# Applying effects and filtering +# ------------------------------ +# +# ``torchaudio.sox_effects`` allows for directly applying filters similar to +# those available in ``sox`` to Tensor objects and file object audio sources. +# +# There are two functions for this: +# +# - ``torchaudio.sox_effects.apply_effects_tensor`` for applying effects +# to Tensor. +# - ``torchaudio.sox_effects.apply_effects_file`` for applying effects to +# other audio sources. +# +# Both functions accept effect definitions in the form +# ``List[List[str]]``. +# This is mostly consistent with how ``sox`` command works, but one caveat is +# that ``sox`` adds some effects automatically, whereas ``torchaudio``’s +# implementation does not. +# +# For the list of available effects, please refer to `the sox +# documentation `__. +# +# **Tip** If you need to load and resample your audio data on the fly, +# then you can use ``torchaudio.sox_effects.apply_effects_file`` with +# effect ``"rate"``. +# +# **Note** ``apply_effects_file`` accepts a file-like object or path-like +# object. Similar to ``torchaudio.load``, when the audio format cannot be +# inferred from either the file extension or header, you can provide +# argument ``format`` to specify the format of the audio source. +# +# **Note** This process is not differentiable. +# + + +# Load the data +waveform1, sample_rate1 = get_sample(resample=16000) + +# Define effects +effects = [ + ["lowpass", "-1", "300"], # apply single-pole lowpass filter + ["speed", "0.8"], # reduce the speed + # This only changes sample rate, so it is necessary to + # add `rate` effect with original sample rate after this. + ["rate", f"{sample_rate1}"], + ["reverb", "-w"], # Reverbration gives some dramatic feeling +] + +# Apply effects +waveform2, sample_rate2 = torchaudio.sox_effects.apply_effects_tensor( + waveform1, sample_rate1, effects) + +plot_waveform(waveform1, sample_rate1, title="Original", xlim=(-.1, 3.2)) +plot_waveform(waveform2, sample_rate2, title="Effects Applied", xlim=(-.1, 3.2)) +print_stats(waveform1, sample_rate=sample_rate1, src="Original") +print_stats(waveform2, sample_rate=sample_rate2, src="Effects Applied") + +###################################################################### +# Note that the number of frames and number of channels are different from +# those of the original after the effects are applied. Let’s listen to the +# audio. Doesn’t it sound more dramatic? 
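######################################################################
# Before listening to the result below, a quick aside on the **Tip** above:
# ``apply_effects_file`` can load and resample in one step. This is a minimal
# sketch, not part of the original example: it reuses ``SAMPLE_WAV_SPEECH_PATH``
# from the preparation cell, and the 8000 Hz target rate is an arbitrary choice.
#

effects_on_load = [
    ["remix", "1"],    # mix down to a single channel
    ["rate", "8000"],  # resample while decoding
]
waveform3, sample_rate3 = torchaudio.sox_effects.apply_effects_file(
    SAMPLE_WAV_SPEECH_PATH, effects=effects_on_load)
print(waveform3.shape, sample_rate3)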
+# + +plot_specgram(waveform1, sample_rate1, title="Original", xlim=(0, 3.04)) +play_audio(waveform1, sample_rate1) +plot_specgram(waveform2, sample_rate2, title="Effects Applied", xlim=(0, 3.04)) +play_audio(waveform2, sample_rate2) + + +###################################################################### +# Simulating room reverberation +# ----------------------------- +# +# `Convolution +# reverb `__ is a +# technique that's used to make clean audio sound as though it has been +# produced in a different environment. +# +# Using Room Impulse Response (RIR), for instance, we can make clean speech +# sound as though it has been uttered in a conference room. +# +# For this process, we need RIR data. The following data are from the VOiCES +# dataset, but you can record your own — just turn on your microphone +# and clap your hands. +# + + +sample_rate = 8000 + +rir_raw, _ = get_rir_sample(resample=sample_rate) + +plot_waveform(rir_raw, sample_rate, title="Room Impulse Response (raw)", ylim=None) +plot_specgram(rir_raw, sample_rate, title="Room Impulse Response (raw)") +play_audio(rir_raw, sample_rate) + +###################################################################### +# First, we need to clean up the RIR. We extract the main impulse, normalize +# the signal power, then flip along the time axis. +# + +rir = rir_raw[:, int(sample_rate*1.01):int(sample_rate*1.3)] +rir = rir / torch.norm(rir, p=2) +rir = torch.flip(rir, [1]) + +print_stats(rir) +plot_waveform(rir, sample_rate, title="Room Impulse Response", ylim=None) + +###################################################################### +# Then, we convolve the speech signal with the RIR filter. +# + +speech, _ = get_speech_sample(resample=sample_rate) + +speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0)) +augmented = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0] + +plot_waveform(speech, sample_rate, title="Original", ylim=None) +plot_waveform(augmented, sample_rate, title="RIR Applied", ylim=None) + +plot_specgram(speech, sample_rate, title="Original") +play_audio(speech, sample_rate) + +plot_specgram(augmented, sample_rate, title="RIR Applied") +play_audio(augmented, sample_rate) + + +###################################################################### +# Adding background noise +# ----------------------- +# +# To add background noise to audio data, you can simply add a noise Tensor to +# the Tensor representing the audio data. A common method to adjust the +# intensity of noise is changing the Signal-to-Noise Ratio (SNR). 
+# [`wikipedia `__] +# +# \begin{align}\mathrm{SNR} = \frac{P_{\mathrm{signal}}}{P_{\mathrm{noise}}}\end{align} +# +# \begin{align}{\mathrm {SNR_{{dB}}}}=10\log _{{10}}\left({\mathrm {SNR}}\right)\end{align} +# + + +sample_rate = 8000 +speech, _ = get_speech_sample(resample=sample_rate) +noise, _ = get_noise_sample(resample=sample_rate) +noise = noise[:, :speech.shape[1]] + +plot_waveform(noise, sample_rate, title="Background noise") +plot_specgram(noise, sample_rate, title="Background noise") +play_audio(noise, sample_rate) + +speech_power = speech.norm(p=2) +noise_power = noise.norm(p=2) + +for snr_db in [20, 10, 3]: + snr = math.exp(snr_db / 10) + scale = snr * noise_power / speech_power + noisy_speech = (scale * speech + noise) / 2 + + plot_waveform(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") + plot_specgram(noisy_speech, sample_rate, title=f"SNR: {snr_db} [dB]") + play_audio(noisy_speech, sample_rate) + +###################################################################### +# Applying codec to Tensor object +# ------------------------------- +# +# ``torchaudio.functional.apply_codec`` can apply codecs to a Tensor object. +# +# **Note** This process is not differentiable. +# + + +waveform, sample_rate = get_speech_sample(resample=8000) + +plot_specgram(waveform, sample_rate, title="Original") +play_audio(waveform, sample_rate) + +configs = [ + ({"format": "wav", "encoding": 'ULAW', "bits_per_sample": 8}, "8 bit mu-law"), + ({"format": "gsm"}, "GSM-FR"), + ({"format": "mp3", "compression": -9}, "MP3"), + ({"format": "vorbis", "compression": -1}, "Vorbis"), +] +for param, title in configs: + augmented = F.apply_codec(waveform, sample_rate, **param) + plot_specgram(augmented, sample_rate, title=title) + play_audio(augmented, sample_rate) + +###################################################################### +# Simulating a phone recoding +# --------------------------- +# +# Combining the previous techniques, we can simulate audio that sounds +# like a person talking over a phone in a echoey room with people talking +# in the background. +# + +sample_rate = 16000 +speech, _ = get_speech_sample(resample=sample_rate) + +plot_specgram(speech, sample_rate, title="Original") +play_audio(speech, sample_rate) + +# Apply RIR +rir, _ = get_rir_sample(resample=sample_rate, processed=True) +speech_ = torch.nn.functional.pad(speech, (rir.shape[1]-1, 0)) +speech = torch.nn.functional.conv1d(speech_[None, ...], rir[None, ...])[0] + +plot_specgram(speech, sample_rate, title="RIR Applied") +play_audio(speech, sample_rate) + +# Add background noise +# Because the noise is recorded in the actual environment, we consider that +# the noise contains the acoustic feature of the environment. Therefore, we add +# the noise after RIR application. 
+noise, _ = get_noise_sample(resample=sample_rate) +noise = noise[:, :speech.shape[1]] + +snr_db = 8 +scale = math.exp(snr_db / 10) * noise.norm(p=2) / speech.norm(p=2) +speech = (scale * speech + noise) / 2 + +plot_specgram(speech, sample_rate, title="BG noise added") +play_audio(speech, sample_rate) + +# Apply filtering and change sample rate +speech, sample_rate = torchaudio.sox_effects.apply_effects_tensor( + speech, + sample_rate, + effects=[ + ["lowpass", "4000"], + ["compand", "0.02,0.05", "-60,-60,-30,-10,-20,-8,-5,-8,-2,-8", "-8", "-7", "0.05"], + ["rate", "8000"], + ], +) + +plot_specgram(speech, sample_rate, title="Filtered") +play_audio(speech, sample_rate) + +# Apply telephony codec +speech = F.apply_codec(speech, sample_rate, format="gsm") + +plot_specgram(speech, sample_rate, title="GSM Codec Applied") +play_audio(speech, sample_rate) diff --git a/examples/tutorials/audio_datasets_tutorial.py b/examples/tutorials/audio_datasets_tutorial.py new file mode 100644 index 0000000000..51900d3cfb --- /dev/null +++ b/examples/tutorials/audio_datasets_tutorial.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +""" +Audio Datasets +============== + +``torchaudio`` provides easy access to common, publicly accessible +datasets. Please refer to the official documentation for the list of +available datasets. +""" + +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio + +import torch +import torchaudio + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. 
+#------------------------------------------------------------------------------- +import multiprocessing +import os + +import matplotlib.pyplot as plt +from IPython.display import Audio, display + + +_SAMPLE_DIR = "_assets" +YESNO_DATASET_PATH = os.path.join(_SAMPLE_DIR, "yes_no") +os.makedirs(YESNO_DATASET_PATH, exist_ok=True) + +def _download_yesno(): + if os.path.exists(os.path.join(YESNO_DATASET_PATH, "waves_yesno.tar.gz")): + return + torchaudio.datasets.YESNO(root=YESNO_DATASET_PATH, download=True) + +YESNO_DOWNLOAD_PROCESS = multiprocessing.Process(target=_download_yesno) +YESNO_DOWNLOAD_PROCESS.start() + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +###################################################################### +# Here, we show how to use the ``YESNO`` dataset. +# + +YESNO_DOWNLOAD_PROCESS.join() + +dataset = torchaudio.datasets.YESNO(YESNO_DATASET_PATH, download=True) + +for i in [1, 3, 5]: + waveform, sample_rate, label = dataset[i] + plot_specgram(waveform, sample_rate, title=f"Sample {i}: {label}") + play_audio(waveform, sample_rate) diff --git a/examples/tutorials/audio_feature_augmentation_tutorial.py b/examples/tutorials/audio_feature_augmentation_tutorial.py new file mode 100644 index 0000000000..03e1d5fe29 --- /dev/null +++ b/examples/tutorials/audio_feature_augmentation_tutorial.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- +""" +Audio Feature Augmentation +========================== +""" + +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio librosa + +import torch +import torchaudio +import torchaudio.transforms as T + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commos BY 4.0. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. 
+#------------------------------------------------------------------------------- + +import os +import requests + +import librosa +import matplotlib.pyplot as plt + + +_SAMPLE_DIR = "_assets" + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +os.makedirs(_SAMPLE_DIR, exist_ok=True) + +def _fetch_data(): + uri = [ + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_speech_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) + +def get_spectrogram( + n_fft = 400, + win_len = None, + hop_len = None, + power = 2.0, +): + waveform, _ = get_speech_sample() + spectrogram = T.Spectrogram( + n_fft=n_fft, + win_length=win_len, + hop_length=hop_len, + center=True, + pad_mode="reflect", + power=power, + ) + return spectrogram(waveform) + +def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Spectrogram (db)') + axs.set_ylabel(ylabel) + axs.set_xlabel('frame') + im = axs.imshow(librosa.power_to_db(spec), origin='lower', aspect=aspect) + if xmax: + axs.set_xlim((0, xmax)) + fig.colorbar(im, ax=axs) + plt.show(block=False) + +###################################################################### +# SpecAugment +# ----------- +# +# `SpecAugment `__ +# is a popular spectrogram augmentation technique. +# +# ``torchaudio`` implements ``TimeStretch``, ``TimeMasking`` and +# ``FrequencyMasking``. 
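######################################################################
# Each of these transforms is demonstrated individually below. As a rough
# sketch of how they might be chained into a single SpecAugment-style pipeline
# (the ``spec_augment`` helper and its parameter values are illustrative
# choices, not part of the original tutorial):
#

def spec_augment(spec, rate=1.2):
    # Stretch the complex spectrogram along the time axis, then mask the
    # resulting power spectrogram along the time and frequency axes.
    spec = T.TimeStretch(n_freq=spec.shape[-2])(spec, rate)
    spec = torch.abs(spec) ** 2
    spec = T.TimeMasking(time_mask_param=80)(spec)
    spec = T.FrequencyMasking(freq_mask_param=80)(spec)
    return spec

spec = get_spectrogram(power=None)  # complex spectrogram, as in the cells below
plot_spectrogram(spec_augment(spec)[0], title="SpecAugment-style pipeline")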
+# +# TimeStretch +# ~~~~~~~~~~~ +# + +spec = get_spectrogram(power=None) +stretch = T.TimeStretch() + +rate = 1.2 +spec_ = stretch(spec, rate) +plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304) + +plot_spectrogram(torch.abs(spec[0]), title="Original", aspect='equal', xmax=304) + +rate = 0.9 +spec_ = stretch(spec, rate) +plot_spectrogram(torch.abs(spec_[0]), title=f"Stretched x{rate}", aspect='equal', xmax=304) + +###################################################################### +# TimeMasking +# ~~~~~~~~~~~ +# + +torch.random.manual_seed(4) + +spec = get_spectrogram() +plot_spectrogram(spec[0], title="Original") + +masking = T.TimeMasking(time_mask_param=80) +spec = masking(spec) + +plot_spectrogram(spec[0], title="Masked along time axis") + +###################################################################### +# FrequencyMasking +# ~~~~~~~~~~~~~~~~ +# + + +torch.random.manual_seed(4) + +spec = get_spectrogram() +plot_spectrogram(spec[0], title="Original") + +masking = T.FrequencyMasking(freq_mask_param=80) +spec = masking(spec) + +plot_spectrogram(spec[0], title="Masked along frequency axis") diff --git a/examples/tutorials/audio_feature_extractions_tutorial.py b/examples/tutorials/audio_feature_extractions_tutorial.py new file mode 100644 index 0000000000..9715cc0966 --- /dev/null +++ b/examples/tutorials/audio_feature_extractions_tutorial.py @@ -0,0 +1,464 @@ +# -*- coding: utf-8 -*- +""" +Audio Feature Extractions +========================= + +``torchaudio`` implements feature extractions commonly used in the audio +domain. They are available in ``torchaudio.functional`` and +``torchaudio.transforms``. + +``functional`` implements features as standalone functions. +They are stateless. + +``transforms`` implements features as objects, +using implementations from ``functional`` and ``torch.nn.Module``. Because all +transforms are subclasses of ``torch.nn.Module``, they can be serialized +using TorchScript. + +For the complete list of available features, please refer to the +documentation. In this tutorial, we will look into converting between the +time domain and frequency domain (``Spectrogram``, ``GriffinLim``, +``MelSpectrogram``). +""" + +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio librosa + +import torch +import torchaudio +import torchaudio.functional as F +import torchaudio.transforms as T + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use a speech data from [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commos BY 4.0. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. 
+#------------------------------------------------------------------------------- + +import os +import requests + +import librosa +import matplotlib.pyplot as plt +from IPython.display import Audio, display + + +_SAMPLE_DIR = "_assets" + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +os.makedirs(_SAMPLE_DIR, exist_ok=True) + + +def _fetch_data(): + uri = [ + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_speech_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_SPEECH_PATH, resample=resample) + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() + +def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Spectrogram (db)') + axs.set_ylabel(ylabel) + axs.set_xlabel('frame') + im = axs.imshow(librosa.power_to_db(spec), origin='lower', aspect=aspect) + if xmax: + axs.set_xlim((0, xmax)) + fig.colorbar(im, ax=axs) + plt.show(block=False) + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def plot_mel_fbank(fbank, title=None): + fig, axs = plt.subplots(1, 1) + axs.set_title(title or 'Filter bank') + axs.imshow(fbank, aspect='auto') + axs.set_ylabel('frequency bin') + axs.set_xlabel('mel bin') + plt.show(block=False) + +def plot_pitch(waveform, sample_rate, pitch): + figure, axis = plt.subplots(1, 1) + axis.set_title("Pitch Feature") + axis.grid(True) + + end_time = waveform.shape[1] / sample_rate + time_axis = torch.linspace(0, end_time, waveform.shape[1]) + axis.plot(time_axis, waveform[0], linewidth=1, color='gray', alpha=0.3) + + axis2 = axis.twinx() + time_axis = torch.linspace(0, end_time, pitch.shape[1]) + ln2 = 
axis2.plot( + time_axis, pitch[0], linewidth=2, label='Pitch', color='green') + + axis2.legend(loc=0) + plt.show(block=False) + +def plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc): + figure, axis = plt.subplots(1, 1) + axis.set_title("Kaldi Pitch Feature") + axis.grid(True) + + end_time = waveform.shape[1] / sample_rate + time_axis = torch.linspace(0, end_time, waveform.shape[1]) + axis.plot(time_axis, waveform[0], linewidth=1, color='gray', alpha=0.3) + + time_axis = torch.linspace(0, end_time, pitch.shape[1]) + ln1 = axis.plot(time_axis, pitch[0], linewidth=2, label='Pitch', color='green') + axis.set_ylim((-1.3, 1.3)) + + axis2 = axis.twinx() + time_axis = torch.linspace(0, end_time, nfcc.shape[1]) + ln2 = axis2.plot( + time_axis, nfcc[0], linewidth=2, label='NFCC', color='blue', linestyle='--') + + lns = ln1 + ln2 + labels = [l.get_label() for l in lns] + axis.legend(lns, labels, loc=0) + plt.show(block=False) + +###################################################################### +# Spectrogram +# ----------- +# +# To get the frequency make-up of an audio signal as it varies with time, +# you can use ``Spectrogram``. +# + + + +waveform, sample_rate = get_speech_sample() + +n_fft = 1024 +win_length = None +hop_length = 512 + +# define transformation +spectrogram = T.Spectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + center=True, + pad_mode="reflect", + power=2.0, +) +# Perform transformation +spec = spectrogram(waveform) + +print_stats(spec) +plot_spectrogram(spec[0], title='torchaudio') + +###################################################################### +# GriffinLim +# ---------- +# +# To recover a waveform from a spectrogram, you can use ``GriffinLim``. +# + + +torch.random.manual_seed(0) +waveform, sample_rate = get_speech_sample() +plot_waveform(waveform, sample_rate, title="Original") +play_audio(waveform, sample_rate) + +n_fft = 1024 +win_length = None +hop_length = 512 + +spec = T.Spectrogram( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, +)(waveform) + +griffin_lim = T.GriffinLim( + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, +) +waveform = griffin_lim(spec) + +plot_waveform(waveform, sample_rate, title="Reconstructed") +play_audio(waveform, sample_rate) + +###################################################################### +# Mel Filter Bank +# --------------- +# +# ``torchaudio.functional.create_fb_matrix`` generates the filter bank +# for converting frequency bins to mel-scale bins. +# +# Since this function does not require input audio/features, there is no +# equivalent transform in ``torchaudio.transforms``. +# + + +n_fft = 256 +n_mels = 64 +sample_rate = 6000 + +mel_filters = F.create_fb_matrix( + int(n_fft // 2 + 1), + n_mels=n_mels, + f_min=0., + f_max=sample_rate/2., + sample_rate=sample_rate, + norm='slaney' +) +plot_mel_fbank(mel_filters, "Mel Filter Bank - torchaudio") + +###################################################################### +# Comparison against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For reference, here is the equivalent way to get the mel filter bank +# with ``librosa``. 
+# + + +mel_filters_librosa = librosa.filters.mel( + sample_rate, + n_fft, + n_mels=n_mels, + fmin=0., + fmax=sample_rate/2., + norm='slaney', + htk=True, +).T + +plot_mel_fbank(mel_filters_librosa, "Mel Filter Bank - librosa") + +mse = torch.square(mel_filters - mel_filters_librosa).mean().item() +print('Mean Square Difference: ', mse) + +###################################################################### +# MelSpectrogram +# -------------- +# +# Generating a mel-scale spectrogram involves generating a spectrogram +# and performing mel-scale conversion. In ``torchaudio``, ``MelSpectrogram`` provides +# this functionality. +# + + +waveform, sample_rate = get_speech_sample() + +n_fft = 1024 +win_length = None +hop_length = 512 +n_mels = 128 + +mel_spectrogram = T.MelSpectrogram( + sample_rate=sample_rate, + n_fft=n_fft, + win_length=win_length, + hop_length=hop_length, + center=True, + pad_mode="reflect", + power=2.0, + norm='slaney', + onesided=True, + n_mels=n_mels, + mel_scale="htk", +) + +melspec = mel_spectrogram(waveform) +plot_spectrogram( + melspec[0], title="MelSpectrogram - torchaudio", ylabel='mel freq') + +###################################################################### +# Comparison against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# For reference, here is the equivalent means of generating mel-scale +# spectrograms with ``librosa``. +# + + +melspec_librosa = librosa.feature.melspectrogram( + waveform.numpy()[0], + sr=sample_rate, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + center=True, + pad_mode="reflect", + power=2.0, + n_mels=n_mels, + norm='slaney', + htk=True, +) +plot_spectrogram( + melspec_librosa, title="MelSpectrogram - librosa", ylabel='mel freq') + +mse = torch.square(melspec - melspec_librosa).mean().item() +print('Mean Square Difference: ', mse) + +###################################################################### +# MFCC +# ---- +# + +waveform, sample_rate = get_speech_sample() + +n_fft = 2048 +win_length = None +hop_length = 512 +n_mels = 256 +n_mfcc = 256 + +mfcc_transform = T.MFCC( + sample_rate=sample_rate, + n_mfcc=n_mfcc, + melkwargs={ + 'n_fft': n_fft, + 'n_mels': n_mels, + 'hop_length': hop_length, + 'mel_scale': 'htk', + } +) + +mfcc = mfcc_transform(waveform) + +plot_spectrogram(mfcc[0]) + +###################################################################### +# Comparing against librosa +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# + + +melspec = librosa.feature.melspectrogram( + y=waveform.numpy()[0], sr=sample_rate, n_fft=n_fft, + win_length=win_length, hop_length=hop_length, + n_mels=n_mels, htk=True, norm=None) + +mfcc_librosa = librosa.feature.mfcc( + S=librosa.core.spectrum.power_to_db(melspec), + n_mfcc=n_mfcc, dct_type=2, norm='ortho') + +plot_spectrogram(mfcc_librosa) + +mse = torch.square(mfcc - mfcc_librosa).mean().item() +print('Mean Square Difference: ', mse) + +###################################################################### +# Pitch +# ----- +# + + +waveform, sample_rate = get_speech_sample() + +pitch = F.detect_pitch_frequency(waveform, sample_rate) +plot_pitch(waveform, sample_rate, pitch) +play_audio(waveform, sample_rate) + +###################################################################### +# Kaldi Pitch (beta) +# ------------------ +# +# Kaldi Pitch feature [1] is a pitch detection mechanism tuned for automatic +# speech recognition (ASR) applications. This is a beta feature in ``torchaudio``, +# and it is available only in ``functional``. +# +# 1. 
A pitch extraction algorithm tuned for automatic speech recognition +# +# Ghahremani, B. BabaAli, D. Povey, K. Riedhammer, J. Trmal and S. +# Khudanpur +# +# 2014 IEEE International Conference on Acoustics, Speech and Signal +# Processing (ICASSP), Florence, 2014, pp. 2494-2498, doi: +# 10.1109/ICASSP.2014.6854049. +# [`abstract `__], +# [`paper `__] +# + + +waveform, sample_rate = get_speech_sample(resample=16000) + +pitch_feature = F.compute_kaldi_pitch(waveform, sample_rate) +pitch, nfcc = pitch_feature[..., 0], pitch_feature[..., 1] + +plot_kaldi_pitch(waveform, sample_rate, pitch, nfcc) +play_audio(waveform, sample_rate) diff --git a/examples/tutorials/audio_io_tutorial.py b/examples/tutorials/audio_io_tutorial.py new file mode 100644 index 0000000000..eb52e07e62 --- /dev/null +++ b/examples/tutorials/audio_io_tutorial.py @@ -0,0 +1,438 @@ +# -*- coding: utf-8 -*- +""" +Audio I/O +========= + +``torchaudio`` integrates ``libsox`` and provides a rich set of audio I/O. +""" + +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio boto3 + +import torch +import torchaudio + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. +#@markdown +#@markdown In this tutorial, we will use speech data from the [VOiCES dataset](https://iqtlabs.github.io/voices/), which is licensed under Creative Commons BY 4.0.
+ + +import io +import os +import requests +import tarfile + +import boto3 +from botocore import UNSIGNED +from botocore.config import Config +import matplotlib.pyplot as plt +from IPython.display import Audio, display + + +_SAMPLE_DIR = "_assets" +SAMPLE_WAV_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.wav" +SAMPLE_WAV_PATH = os.path.join(_SAMPLE_DIR, "steam.wav") + +SAMPLE_MP3_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.mp3" +SAMPLE_MP3_PATH = os.path.join(_SAMPLE_DIR, "steam.mp3") + +SAMPLE_GSM_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/steam-train-whistle-daniel_simon.gsm" +SAMPLE_GSM_PATH = os.path.join(_SAMPLE_DIR, "steam.gsm") + +SAMPLE_WAV_SPEECH_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" +SAMPLE_WAV_SPEECH_PATH = os.path.join(_SAMPLE_DIR, "speech.wav") + +SAMPLE_TAR_URL = "https://pytorch-tutorial-assets.s3.amazonaws.com/VOiCES_devkit.tar.gz" +SAMPLE_TAR_PATH = os.path.join(_SAMPLE_DIR, "sample.tar.gz") +SAMPLE_TAR_ITEM = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + +S3_BUCKET = "pytorch-tutorial-assets" +S3_KEY = "VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav" + +def _fetch_data(): + os.makedirs(_SAMPLE_DIR, exist_ok=True) + uri = [ + (SAMPLE_WAV_URL, SAMPLE_WAV_PATH), + (SAMPLE_MP3_URL, SAMPLE_MP3_PATH), + (SAMPLE_GSM_URL, SAMPLE_GSM_PATH), + (SAMPLE_WAV_SPEECH_URL, SAMPLE_WAV_SPEECH_PATH), + (SAMPLE_TAR_URL, SAMPLE_TAR_PATH), + ] + for url, path in uri: + with open(path, 'wb') as file_: + file_.write(requests.get(url).content) + +_fetch_data() + +def print_stats(waveform, sample_rate=None, src=None): + if src: + print("-" * 10) + print("Source:", src) + print("-" * 10) + if sample_rate: + print("Sample Rate:", sample_rate) + print("Shape:", tuple(waveform.shape)) + print("Dtype:", waveform.dtype) + print(f" - Max: {waveform.max().item():6.3f}") + print(f" - Min: {waveform.min().item():6.3f}") + print(f" - Mean: {waveform.mean().item():6.3f}") + print(f" - Std Dev: {waveform.std().item():6.3f}") + print() + print(waveform) + print() + +def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].plot(time_axis, waveform[c], linewidth=1) + axes[c].grid(True) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + if ylim: + axes[c].set_ylim(ylim) + figure.suptitle(title) + plt.show(block=False) + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], 
rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def _get_sample(path, resample=None): + effects = [ + ["remix", "1"] + ] + if resample: + effects.extend([ + ["lowpass", f"{resample // 2}"], + ["rate", f'{resample}'], + ]) + return torchaudio.sox_effects.apply_effects_file(path, effects=effects) + +def get_sample(*, resample=None): + return _get_sample(SAMPLE_WAV_PATH, resample=resample) + +def inspect_file(path): + print("-" * 10) + print("Source:", path) + print("-" * 10) + print(f" - File size: {os.path.getsize(path)} bytes") + print(f" - {torchaudio.info(path)}") + +###################################################################### +# Quering audio metadata +# ---------------------- +# +# Function ``torchaudio.info`` fetches audio metadata. You can provide +# a path-like object or file-like object. +# + +metadata = torchaudio.info(SAMPLE_WAV_PATH) +print(metadata) + +###################################################################### +# Where +# +# - ``sample_rate`` is the sampling rate of the audio +# - ``num_channels`` is the number of channels +# - ``num_frames`` is the number of frames per channel +# - ``bits_per_sample`` is bit depth +# - ``encoding`` is the sample coding format +# +# ``encoding`` can take on one of the following values: +# +# - ``"PCM_S"``: Signed integer linear PCM +# - ``"PCM_U"``: Unsigned integer linear PCM +# - ``"PCM_F"``: Floating point linear PCM +# - ``"FLAC"``: Flac, `Free Lossless Audio +# Codec `__ +# - ``"ULAW"``: Mu-law, +# [`wikipedia `__] +# - ``"ALAW"``: A-law +# [`wikipedia `__] +# - ``"MP3"`` : MP3, MPEG-1 Audio Layer III +# - ``"VORBIS"``: OGG Vorbis [`xiph.org `__] +# - ``"AMR_NB"``: Adaptive Multi-Rate +# [`wikipedia `__] +# - ``"AMR_WB"``: Adaptive Multi-Rate Wideband +# [`wikipedia `__] +# - ``"OPUS"``: Opus [`opus-codec.org `__] +# - ``"GSM"``: GSM-FR +# [`wikipedia `__] +# - ``"UNKNOWN"`` None of above +# + +###################################################################### +# **Note** +# +# - ``bits_per_sample`` can be ``0`` for formats with compression and/or +# variable bit rate (such as MP3). +# - ``num_frames`` can be ``0`` for GSM-FR format. +# + +metadata = torchaudio.info(SAMPLE_MP3_PATH) +print(metadata) + +metadata = torchaudio.info(SAMPLE_GSM_PATH) +print(metadata) + + +###################################################################### +# Querying file-like object +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# ``info`` works on file-like objects. +# + +print("Source:", SAMPLE_WAV_URL) +with requests.get(SAMPLE_WAV_URL, stream=True) as response: + metadata = torchaudio.info(response.raw) +print(metadata) + +###################################################################### +# **Note** When passing a file-like object, ``info`` does not read +# all of the underlying data; rather, it reads only a portion +# of the data from the beginning. +# Therefore, for a given audio format, it may not be able to retrieve the +# correct metadata, including the format itself. +# The following example illustrates this. +# +# - Use argument ``format`` to specify the audio format of the input. 
+# - The returned metadata has ``num_frames = 0`` +# + +print("Source:", SAMPLE_MP3_URL) +with requests.get(SAMPLE_MP3_URL, stream=True) as response: + metadata = torchaudio.info(response.raw, format="mp3") + + print(f"Fetched {response.raw.tell()} bytes.") +print(metadata) + +###################################################################### +# Loading audio data into Tensor +# ------------------------------ +# +# To load audio data, you can use ``torchaudio.load``. +# +# This function accepts a path-like object or file-like object as input. +# +# The returned value is a tuple of waveform (``Tensor``) and sample rate +# (``int``). +# +# By default, the resulting tensor object has ``dtype=torch.float32`` and +# its value range is normalized within ``[-1.0, 1.0]``. +# +# For the list of supported format, please refer to `the torchaudio +# documentation `__. +# + +waveform, sample_rate = torchaudio.load(SAMPLE_WAV_SPEECH_PATH) + +print_stats(waveform, sample_rate=sample_rate) +plot_waveform(waveform, sample_rate) +plot_specgram(waveform, sample_rate) +play_audio(waveform, sample_rate) + + +###################################################################### +# Loading from file-like object +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# ``torchaudio``\ ’s I/O functions now support file-like objects. This +# allows for fetching and decoding audio data from locations +# within and beyond the local file system. +# The following examples illustrate this. +# + +# Load audio data as HTTP request +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform, sample_rate = torchaudio.load(response.raw) +plot_specgram(waveform, sample_rate, title="HTTP datasource") + +# Load audio from tar file +with tarfile.open(SAMPLE_TAR_PATH, mode='r') as tarfile_: + fileobj = tarfile_.extractfile(SAMPLE_TAR_ITEM) + waveform, sample_rate = torchaudio.load(fileobj) +plot_specgram(waveform, sample_rate, title="TAR file") + +# Load audio from S3 +client = boto3.client('s3', config=Config(signature_version=UNSIGNED)) +response = client.get_object(Bucket=S3_BUCKET, Key=S3_KEY) +waveform, sample_rate = torchaudio.load(response['Body']) +plot_specgram(waveform, sample_rate, title="From S3") + + +###################################################################### +# Tips on slicing +# ~~~~~~~~~~~~~~~ +# +# Providing ``num_frames`` and ``frame_offset`` arguments restricts +# decoding to the corresponding segment of the input. +# +# The same result can be achieved using vanilla Tensor slicing, +# (i.e. ``waveform[:, frame_offset:frame_offset+num_frames]``). However, +# providing ``num_frames`` and ``frame_offset`` arguments is more +# efficient. +# +# This is because the function will end data acquisition and decoding +# once it finishes decoding the requested frames. This is advantageous +# when the audio data are transferred via network as the data transfer will +# stop as soon as the necessary amount of data is fetched. +# +# The following example illustrates this. +# + +# Illustration of two different decoding methods. +# The first one will fetch all the data and decode them, while +# the second one will stop fetching data once it completes decoding. +# The resulting waveforms are identical. 
+ +frame_offset, num_frames = 16000, 16000 # Fetch and decode the 1 - 2 seconds + +print("Fetching all the data...") +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform1, sample_rate1 = torchaudio.load(response.raw) + waveform1 = waveform1[:, frame_offset:frame_offset+num_frames] + print(f" - Fetched {response.raw.tell()} bytes") + +print("Fetching until the requested frames are available...") +with requests.get(SAMPLE_WAV_SPEECH_URL, stream=True) as response: + waveform2, sample_rate2 = torchaudio.load( + response.raw, frame_offset=frame_offset, num_frames=num_frames) + print(f" - Fetched {response.raw.tell()} bytes") + +print("Checking the resulting waveform ... ", end="") +assert (waveform1 == waveform2).all() +print("matched!") + + +###################################################################### +# Saving audio to file +# -------------------- +# +# To save audio data in formats interpretable by common applications, +# you can use ``torchaudio.save``. +# +# This function accepts a path-like object or file-like object. +# +# When passing a file-like object, you also need to provide argument ``format`` +# so that the function knows which format it should use. In the +# case of a path-like object, the function will infer the format from +# the extension. If you are saving to a file without an extension, you need +# to provide argument ``format``. +# +# When saving WAV-formatted data, the default encoding for ``float32`` Tensor +# is 32-bit floating-point PCM. You can provide arguments ``encoding`` and +# ``bits_per_sample`` to change this behavior. For example, to save data +# in 16-bit signed integer PCM, you can do the following. +# +# **Note** Saving data in encodings with lower bit depth reduces the +# resulting file size but also precision. +# + + +waveform, sample_rate = get_sample() +print_stats(waveform, sample_rate=sample_rate) + +# Save without any encoding option. +# The function will pick up the encoding which +# the provided data fit +path = f"{_SAMPLE_DIR}/save_example_default.wav" +torchaudio.save(path, waveform, sample_rate) +inspect_file(path) + +# Save as 16-bit signed integer Linear PCM +# The resulting file occupies half the storage but loses precision +path = f"{_SAMPLE_DIR}/save_example_PCM_S16.wav" +torchaudio.save( + path, waveform, sample_rate, + encoding="PCM_S", bits_per_sample=16) +inspect_file(path) + + +###################################################################### +# ``torchaudio.save`` can also handle other formats. To name a few: +# + +waveform, sample_rate = get_sample(resample=8000) + +formats = [ + "mp3", + "flac", + "vorbis", + "sph", + "amb", + "amr-nb", + "gsm", +] + +for format in formats: + path = f"{_SAMPLE_DIR}/save_example.{format}" + torchaudio.save(path, waveform, sample_rate, format=format) + inspect_file(path) + + +###################################################################### +# Saving to file-like object +# ~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Similar to the other I/O functions, you can save audio to file-like +# objects. When saving to a file-like object, argument ``format`` is +# required. 
+# + + +waveform, sample_rate = get_sample() + +# Saving to bytes buffer +buffer_ = io.BytesIO() +torchaudio.save(buffer_, waveform, sample_rate, format="wav") + +buffer_.seek(0) +print(buffer_.read(16)) + diff --git a/examples/tutorials/audio_resampling_tutorial.py b/examples/tutorials/audio_resampling_tutorial.py new file mode 100644 index 0000000000..9e710ba519 --- /dev/null +++ b/examples/tutorials/audio_resampling_tutorial.py @@ -0,0 +1,441 @@ +# -*- coding: utf-8 -*- +""" +Audio Resampling +================ + +Here, we will walk through resampling audio waveforms using ``torchaudio``. + +""" + +# When running this tutorial in Google Colab, install the required packages +# with the following. +# !pip install torchaudio librosa + +import torch +import torchaudio +import torchaudio.functional as F +import torchaudio.transforms as T + +print(torch.__version__) +print(torchaudio.__version__) + +###################################################################### +# Preparing data and utility functions (skip this section) +# -------------------------------------------------------- +# + +#@title Prepare data and utility functions. {display-mode: "form"} +#@markdown +#@markdown You do not need to look into this cell. +#@markdown Just execute once and you are good to go. + +#------------------------------------------------------------------------------- +# Preparation of data and helper functions. +#------------------------------------------------------------------------------- + +import math +import time + +import librosa +import matplotlib.pyplot as plt +from IPython.display import Audio, display +import pandas as pd + + +DEFAULT_OFFSET = 201 +SWEEP_MAX_SAMPLE_RATE = 48000 +DEFAULT_LOWPASS_FILTER_WIDTH = 6 +DEFAULT_ROLLOFF = 0.99 +DEFAULT_RESAMPLING_METHOD = 'sinc_interpolation' + + +def _get_log_freq(sample_rate, max_sweep_rate, offset): + """Get freqs evenly spaced out in log-scale, between [0, max_sweep_rate // 2] + + offset is used to avoid negative infinity `log(offset + x)`. 
+ + """ + half = sample_rate // 2 + start, stop = math.log(offset), math.log(offset + max_sweep_rate // 2) + return torch.exp(torch.linspace(start, stop, sample_rate, dtype=torch.double)) - offset + +def _get_inverse_log_freq(freq, sample_rate, offset): + """Find the time where the given frequency is given by _get_log_freq""" + half = sample_rate // 2 + return sample_rate * (math.log(1 + freq / offset) / math.log(1 + half / offset)) + +def _get_freq_ticks(sample_rate, offset, f_max): + # Given the original sample rate used for generating the sweep, + # find the x-axis value where the log-scale major frequency values fall in + time, freq = [], [] + for exp in range(2, 5): + for v in range(1, 10): + f = v * 10 ** exp + if f < sample_rate // 2: + t = _get_inverse_log_freq(f, sample_rate, offset) / sample_rate + time.append(t) + freq.append(f) + t_max = _get_inverse_log_freq(f_max, sample_rate, offset) / sample_rate + time.append(t_max) + freq.append(f_max) + return time, freq + +def get_sine_sweep(sample_rate, offset=DEFAULT_OFFSET): + max_sweep_rate = sample_rate + freq = _get_log_freq(sample_rate, max_sweep_rate, offset) + delta = 2 * math.pi * freq / sample_rate + cummulative = torch.cumsum(delta, dim=0) + signal = torch.sin(cummulative).unsqueeze(dim=0) + return signal + +def plot_sweep(waveform, sample_rate, title, max_sweep_rate=SWEEP_MAX_SAMPLE_RATE, offset=DEFAULT_OFFSET): + x_ticks = [100, 500, 1000, 5000, 10000, 20000, max_sweep_rate // 2] + y_ticks = [1000, 5000, 10000, 20000, sample_rate//2] + + time, freq = _get_freq_ticks(max_sweep_rate, offset, sample_rate // 2) + freq_x = [f if f in x_ticks and f <= max_sweep_rate // 2 else None for f in freq] + freq_y = [f for f in freq if f >= 1000 and f in y_ticks and f <= sample_rate // 2] + + figure, axis = plt.subplots(1, 1) + axis.specgram(waveform[0].numpy(), Fs=sample_rate) + plt.xticks(time, freq_x) + plt.yticks(freq_y, freq_y) + axis.set_xlabel('Original Signal Frequency (Hz, log scale)') + axis.set_ylabel('Waveform Frequency (Hz)') + axis.xaxis.grid(True, alpha=0.67) + axis.yaxis.grid(True, alpha=0.67) + figure.suptitle(f'{title} (sample rate: {sample_rate} Hz)') + plt.show(block=True) + +def play_audio(waveform, sample_rate): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + if num_channels == 1: + display(Audio(waveform[0], rate=sample_rate)) + elif num_channels == 2: + display(Audio((waveform[0], waveform[1]), rate=sample_rate)) + else: + raise ValueError("Waveform with more than 2 channels are not supported.") + +def plot_specgram(waveform, sample_rate, title="Spectrogram", xlim=None): + waveform = waveform.numpy() + + num_channels, num_frames = waveform.shape + time_axis = torch.arange(0, num_frames) / sample_rate + + figure, axes = plt.subplots(num_channels, 1) + if num_channels == 1: + axes = [axes] + for c in range(num_channels): + axes[c].specgram(waveform[c], Fs=sample_rate) + if num_channels > 1: + axes[c].set_ylabel(f'Channel {c+1}') + if xlim: + axes[c].set_xlim(xlim) + figure.suptitle(title) + plt.show(block=False) + +def benchmark_resample( + method, + waveform, + sample_rate, + resample_rate, + lowpass_filter_width=DEFAULT_LOWPASS_FILTER_WIDTH, + rolloff=DEFAULT_ROLLOFF, + resampling_method=DEFAULT_RESAMPLING_METHOD, + beta=None, + librosa_type=None, + iters=5 +): + if method == "functional": + begin = time.time() + for _ in range(iters): + F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width, + rolloff=rolloff, resampling_method=resampling_method) + 
+    elapsed = time.time() - begin
+    return elapsed / iters
+  elif method == "transforms":
+    resampler = T.Resample(sample_rate, resample_rate, lowpass_filter_width=lowpass_filter_width,
+                           rolloff=rolloff, resampling_method=resampling_method, dtype=waveform.dtype)
+    begin = time.time()
+    for _ in range(iters):
+      resampler(waveform)
+    elapsed = time.time() - begin
+    return elapsed / iters
+  elif method == "librosa":
+    waveform_np = waveform.squeeze().numpy()
+    begin = time.time()
+    for _ in range(iters):
+      librosa.resample(waveform_np, sample_rate, resample_rate, res_type=librosa_type)
+    elapsed = time.time() - begin
+    return elapsed / iters
+
+######################################################################
+# To resample an audio waveform from one frequency to another, you can use
+# ``transforms.Resample`` or ``functional.resample``.
+# ``transforms.Resample`` precomputes and caches the kernel used for
+# resampling, while ``functional.resample`` computes it on the fly, so
+# using ``transforms.Resample`` will result in a speedup when resampling
+# multiple waveforms using the same parameters (see Benchmarking section).
+#
+# Both resampling methods use `bandlimited sinc
+# interpolation <https://ccrma.stanford.edu/~jos/resample/>`__ to compute
+# signal values at arbitrary time steps. The implementation involves
+# convolution, so we can take advantage of GPU / multithreading for
+# performance improvements. When using resampling in multiple
+# subprocesses, such as data loading with multiple worker processes, your
+# application might create more threads than your system can handle
+# efficiently. Setting ``torch.set_num_threads(1)`` might help in this
+# case.
+#
+# Because a finite number of samples can only represent a finite number of
+# frequencies, resampling does not produce perfect results, and a variety
+# of parameters can be used to control its quality and computational
+# speed. We demonstrate these properties through resampling a logarithmic
+# sine sweep, which is a sine wave that increases exponentially in
+# frequency over time.
+#
+# The spectrograms below show the frequency representation of the signal,
+# where the x-axis corresponds to the frequency of the original
+# waveform (in log scale), the y-axis to the frequency of the
+# plotted waveform, and the color intensity to the amplitude.
+#
+
+sample_rate = 48000
+resample_rate = 32000
+
+waveform = get_sine_sweep(sample_rate)
+plot_sweep(waveform, sample_rate, title="Original Waveform")
+play_audio(waveform, sample_rate)
+
+resampler = T.Resample(sample_rate, resample_rate, dtype=waveform.dtype)
+resampled_waveform = resampler(waveform)
+plot_sweep(resampled_waveform, resample_rate, title="Resampled Waveform")
+play_audio(resampled_waveform, resample_rate)
+
+
+######################################################################
+# Controlling resampling quality with parameters
+# ----------------------------------------------
+#
+# Lowpass filter width
+# ~~~~~~~~~~~~~~~~~~~~
+#
+# Because the filter used for interpolation extends infinitely, the
+# ``lowpass_filter_width`` parameter is used to control the width of
+# the filter used to window the interpolation. It is also referred to as
+# the number of zero crossings, since the interpolation passes through
+# zero at every time unit. Using a larger ``lowpass_filter_width``
+# provides a sharper, more precise filter, but is more computationally
+# expensive.
+#
+
+
+sample_rate = 48000
+resample_rate = 32000
+
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=6)
+plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=6")
+
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, lowpass_filter_width=128)
+plot_sweep(resampled_waveform, resample_rate, title="lowpass_filter_width=128")
+
+
+######################################################################
+# Rolloff
+# ~~~~~~~
+#
+# The ``rolloff`` parameter is represented as a fraction of the Nyquist
+# frequency, which is the maximal frequency representable by a given
+# finite sample rate. ``rolloff`` determines the lowpass filter cutoff and
+# controls the degree of aliasing, which takes place when frequencies
+# higher than the Nyquist are mapped to lower frequencies. A lower rolloff
+# will therefore reduce the amount of aliasing, but it will also attenuate
+# some of the higher frequencies.
+#
+
+
+sample_rate = 48000
+resample_rate = 32000
+
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=0.99)
+plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.99")
+
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, rolloff=0.8)
+plot_sweep(resampled_waveform, resample_rate, title="rolloff=0.8")
+
+
+######################################################################
+# Window function
+# ~~~~~~~~~~~~~~~
+#
+# By default, ``torchaudio``’s resample uses the Hann window filter, which is
+# a weighted cosine function. It additionally supports the Kaiser window,
+# which is a near-optimal window function with an additional ``beta``
+# parameter that allows for designing the smoothness of the filter and the
+# width of its impulse. The window function can be selected with the
+# ``resampling_method`` parameter.
+#
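+
+
+######################################################################
+# As a minimal illustrative sketch (the ``beta`` value below is arbitrary,
+# not a tuned recommendation), an explicit ``beta`` can be passed when
+# ``resampling_method="kaiser_window"`` is used:
+#
+
+resampled_waveform = F.resample(
+    waveform, sample_rate, resample_rate,
+    resampling_method="kaiser_window", beta=8.0)
+plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window (beta=8.0)")
+
+
+######################################################################
+# The following cell compares the two windows with their default settings.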
+#
+
+
+sample_rate = 48000
+resample_rate = 32000
+
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="sinc_interpolation")
+plot_sweep(resampled_waveform, resample_rate, title="Hann Window Default")
+
+resampled_waveform = F.resample(waveform, sample_rate, resample_rate, resampling_method="kaiser_window")
+plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Default")
+
+
+######################################################################
+# Comparison against librosa
+# --------------------------
+#
+# ``torchaudio``’s resample function can be used to produce results similar
+# to those of librosa (resampy)’s Kaiser window resampling, with some noise.
+#
+
+
+sample_rate = 48000
+resample_rate = 32000
+
+### kaiser_best
+resampled_waveform = F.resample(
+    waveform,
+    sample_rate,
+    resample_rate,
+    lowpass_filter_width=64,
+    rolloff=0.9475937167399596,
+    resampling_method="kaiser_window",
+    beta=14.769656459379492
+)
+plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Best (torchaudio)")
+
+librosa_resampled_waveform = torch.from_numpy(
+    librosa.resample(waveform.squeeze().numpy(), sample_rate, resample_rate, res_type='kaiser_best')).unsqueeze(0)
+plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Best (librosa)")
+
+mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item()
+print("torchaudio and librosa kaiser best MSE:", mse)
+
+### kaiser_fast
+resampled_waveform = F.resample(
+    waveform,
+    sample_rate,
+    resample_rate,
+    lowpass_filter_width=16,
+    rolloff=0.85,
+    resampling_method="kaiser_window",
+    beta=8.555504641634386
+)
+plot_sweep(resampled_waveform, resample_rate, title="Kaiser Window Fast (torchaudio)")
+
+librosa_resampled_waveform = torch.from_numpy(
+    librosa.resample(waveform.squeeze().numpy(), sample_rate, resample_rate, res_type='kaiser_fast')).unsqueeze(0)
+plot_sweep(librosa_resampled_waveform, resample_rate, title="Kaiser Window Fast (librosa)")
+
+mse = torch.square(resampled_waveform - librosa_resampled_waveform).mean().item()
+print("torchaudio and librosa kaiser fast MSE:", mse)
+
+
+######################################################################
+# Performance Benchmarking
+# ------------------------
+#
+# Below are benchmarks for downsampling and upsampling waveforms between
+# two pairs of sampling rates. We demonstrate the performance implications
+# that the ``lowpass_filter_width``, window type, and sample rates can
+# have. Additionally, we provide a comparison against ``librosa``\ ’s
+# ``kaiser_best`` and ``kaiser_fast`` using their corresponding parameters
+# in ``torchaudio``.
+#
+# To elaborate on the results:
+#
+# - a larger ``lowpass_filter_width`` results in a larger resampling kernel,
+#   and therefore increases computation time for both the kernel computation
+#   and convolution
+# - using ``kaiser_window`` results in longer computation times than the default
+#   ``sinc_interpolation`` because it is more complex to compute the intermediate
+#   window values
+# - a large GCD between the sample and resample rate will result in a
+#   simplification that allows for a smaller kernel and faster kernel
+#   computation (see the quick check below)
+#
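+
+
+######################################################################
+# As a quick illustrative check of the last point (this cell is not part
+# of the benchmark itself), the rate pairs used below reduce to very
+# different ratios. ``math`` is already imported above for the sweep
+# helpers.
+#
+
+for orig_freq, new_freq in [(48000, 44100), (16000, 8000), (44100, 48000), (8000, 16000)]:
+    gcd = math.gcd(orig_freq, new_freq)
+    print(f"{orig_freq} -> {new_freq}: gcd={gcd}, ratio={orig_freq // gcd}:{new_freq // gcd}")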
+
+
+configs = {
+    "downsample (48 -> 44.1 kHz)": [48000, 44100],
+    "downsample (16 -> 8 kHz)": [16000, 8000],
+    "upsample (44.1 -> 48 kHz)": [44100, 48000],
+    "upsample (8 -> 16 kHz)": [8000, 16000],
+}
+
+for label in configs:
+  times, rows = [], []
+  sample_rate = configs[label][0]
+  resample_rate = configs[label][1]
+  waveform = get_sine_sweep(sample_rate)
+
+  # sinc 64 zero-crossings
+  f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=64)
+  t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=64)
+  times.append([None, 1000 * f_time, 1000 * t_time])
+  rows.append("sinc (width 64)")
+
+  # sinc 16 zero-crossings
+  f_time = benchmark_resample("functional", waveform, sample_rate, resample_rate, lowpass_filter_width=16)
+  t_time = benchmark_resample("transforms", waveform, sample_rate, resample_rate, lowpass_filter_width=16)
+  times.append([None, 1000 * f_time, 1000 * t_time])
+  rows.append("sinc (width 16)")
+
+  # kaiser best
+  lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_best")
+  f_time = benchmark_resample(
+      "functional",
+      waveform,
+      sample_rate,
+      resample_rate,
+      lowpass_filter_width=64,
+      rolloff=0.9475937167399596,
+      resampling_method="kaiser_window",
+      beta=14.769656459379492)
+  t_time = benchmark_resample(
+      "transforms",
+      waveform,
+      sample_rate,
+      resample_rate,
+      lowpass_filter_width=64,
+      rolloff=0.9475937167399596,
+      resampling_method="kaiser_window",
+      beta=14.769656459379492)
+  times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
+  rows.append("kaiser_best")
+
+  # kaiser fast
+  lib_time = benchmark_resample("librosa", waveform, sample_rate, resample_rate, librosa_type="kaiser_fast")
+  f_time = benchmark_resample(
+      "functional",
+      waveform,
+      sample_rate,
+      resample_rate,
+      lowpass_filter_width=16,
+      rolloff=0.85,
+      resampling_method="kaiser_window",
+      beta=8.555504641634386)
+  t_time = benchmark_resample(
+      "transforms",
+      waveform,
+      sample_rate,
+      resample_rate,
+      lowpass_filter_width=16,
+      rolloff=0.85,
+      resampling_method="kaiser_window",
+      beta=8.555504641634386)
+  times.append([1000 * lib_time, 1000 * f_time, 1000 * t_time])
+  rows.append("kaiser_fast")
+
+  df = pd.DataFrame(times,
+                    columns=["librosa", "functional", "transforms"],
+                    index=rows)
+  df.columns = pd.MultiIndex.from_product([[f"{label} time (ms)"], df.columns])
+  display(df.round(2))
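+
+
+######################################################################
+# Putting the earlier notes together: ``transforms.Resample`` caches its
+# kernel, so it pays off to build it once and reuse it across items, and
+# ``torch.set_num_threads(1)`` keeps worker processes from oversubscribing
+# CPU threads. The cell below is a minimal illustrative sketch only;
+# ``SweepDataset`` and ``worker_init_fn`` are hypothetical names, not part
+# of ``torchaudio``.
+#
+
+from torch.utils.data import DataLoader, Dataset
+
+
+class SweepDataset(Dataset):
+    """Hypothetical dataset that resamples every item with a shared kernel."""
+
+    def __init__(self, orig_freq, new_freq, num_items=4):
+        self.orig_freq = orig_freq
+        self.num_items = num_items
+        # The resampling kernel is computed once here and reused for every item.
+        self.resampler = T.Resample(orig_freq, new_freq, dtype=torch.float64)
+
+    def __len__(self):
+        return self.num_items
+
+    def __getitem__(self, index):
+        return self.resampler(get_sine_sweep(self.orig_freq))
+
+
+def worker_init_fn(worker_id):
+    # Avoid creating more threads than the system can handle efficiently.
+    torch.set_num_threads(1)
+
+
+loader = DataLoader(
+    SweepDataset(48000, 32000), batch_size=2, num_workers=2, worker_init_fn=worker_init_fn)
+for batch in loader:
+    print(batch.shape)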