Source code for torchaudio
-from __future__ import division, print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
import os.path
import torch
diff --git a/_modules/torchaudio/compliance/kaldi.html b/_modules/torchaudio/compliance/kaldi.html
index a9875c3b2b..41051037e8 100644
--- a/_modules/torchaudio/compliance/kaldi.html
+++ b/_modules/torchaudio/compliance/kaldi.html
@@ -215,19 +215,22 @@
Source code for torchaudio.compliance.kaldi
-import math
+from __future__ import absolute_import, division, print_function, unicode_literals
+import math
+import fractions
import random
import torch
-
+import torchaudio
__all__ = [
- 'fbank',
'get_mel_banks',
'inverse_mel_scale',
'inverse_mel_scale_scalar',
'mel_scale',
'mel_scale_scalar',
'spectrogram',
+ 'fbank',
+ 'mfcc',
'vtln_warp_freq',
'vtln_warp_mel_freq',
'resample_waveform',
@@ -332,7 +335,9 @@ Source code for torchaudio.compliance.kaldi
frame_length, round_to_power_of_two, preemphasis_coefficient):
r"""Gets the waveform and window properties
"""
- waveform = waveform[max(channel, 0), :] # size (n)
+ channel = max(channel, 0)
+ assert channel < waveform.size(0), ('Invalid channel %d for size %d' % (channel, waveform.size(0)))
+ waveform = waveform[channel, :] # size (n)
window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
@@ -397,6 +402,15 @@ Source code for torchaudio.compliance.kaldi
return strided_input, signal_log_energy
+def _subtract_column_mean(tensor, subtract_mean):
+ # subtracts the column mean of the tensor size (m, n) if subtract_mean=True
+ # it returns size (m, n)
+ if subtract_mean:
+ col_means = torch.mean(tensor, dim=0).unsqueeze(0)
+ tensor = tensor - col_means
+ return tensor
+
+
[docs]def spectrogram(
waveform, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0,
frame_length=25.0, frame_shift=10.0, min_duration=0.0,
@@ -454,10 +468,7 @@ Source code for torchaudio.compliance.kaldi
power_spectrum = torch.max(fft.pow(2).sum(2), EPSILON).log() # size (m, padded_window_size // 2 + 1)
power_spectrum[:, 0] = signal_log_energy
- if subtract_mean:
- col_means = torch.mean(power_spectrum, dim=0).unsqueeze(0) # size (1, padded_window_size // 2 + 1)
- power_spectrum = power_spectrum - col_means
-
+ power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
return power_spectrum
@@ -719,7 +730,7 @@ Source code for torchaudio.compliance.kaldi
# avoid log of zero (which should be prevented anyway by dithering)
mel_energies = torch.max(mel_energies, EPSILON).log()
- # if use_energy then add it as the first column for htk_compat == true else last column
+ # if use_energy then add it as the last column for htk_compat == true else first column
if use_energy:
signal_log_energy = signal_log_energy.unsqueeze(1) # size (m, 1)
# returns size (m, num_mel_bins + 1)
@@ -728,13 +739,134 @@ Source code for torchaudio.compliance.kaldi
else:
mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)
- if subtract_mean:
- col_means = torch.mean(mel_energies, dim=0).unsqueeze(0) # size (1, num_mel_bins + use_energy)
- mel_energies = mel_energies - col_means
-
+ mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
return mel_energies
+def _get_dct_matrix(num_ceps, num_mel_bins):
+ # returns a dct matrix of size (num_mel_bins, num_ceps)
+ # size (num_mel_bins, num_mel_bins)
+ dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, 'ortho')
+ # kaldi expects the first cepstral to be a weighted sum with factor sqrt(1/num_mel_bins).
+ # This is the first column of the dct_matrix for torchaudio, as it expects a
+ # right multiply (it would be the first row of kaldi's dct_matrix, as kaldi
+ # expects a left multiply e.g. dct_matrix * vector).
+ dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins))
+ dct_matrix = dct_matrix[:, :num_ceps]
+ return dct_matrix
+
+
+def _get_lifter_coeffs(num_ceps, cepstral_lifter):
+ # returns size (num_ceps)
+ # Compute liftering coefficients (scaling on cepstral coeffs)
+ # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
+ i = torch.arange(num_ceps, dtype=torch.get_default_dtype())
+ return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter)
+
+
+[docs]def mfcc(
+ waveform, blackman_coeff=0.42, cepstral_lifter=22.0, channel=-1, dither=1.0,
+ energy_floor=0.0, frame_length=25.0, frame_shift=10.0, high_freq=0.0, htk_compat=False,
+ low_freq=20.0, num_ceps=13, min_duration=0.0, num_mel_bins=23, preemphasis_coefficient=0.97,
+ raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True,
+ sample_frequency=16000.0, snip_edges=True, subtract_mean=False, use_energy=False,
+ vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type=POVEY):
+ r"""Create an MFCC from a raw audio signal. This matches the input/output of Kaldi's
+ compute-mfcc-feats.
+
+ Args:
+ waveform (torch.Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
+ blackman_coeff (float): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
+ cepstral_lifter (float): Constant that controls scaling of MFCCs (Default: ``22.0``)
+ channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
+ dither (float): Dithering constant (0.0 means no dither). If you turn this off, you should set
+ the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``1.0``)
+ energy_floor (float): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
+ this floor is applied to the zeroth component, representing the total signal energy. The floor on the
+ individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``0.0``)
+ frame_length (float): Frame length in milliseconds (Default: ``25.0``)
+ frame_shift (float): Frame shift in milliseconds (Default: ``10.0``)
+ high_freq (float): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (Default: ``0.0``)
+ htk_compat (bool): If true, put energy last. Warning: not sufficient to get HTK compatible features (need
+ to change other parameters). (Default: ``False``)
+ low_freq (float): Low cutoff frequency for mel bins (Default: ``20.0``)
+ num_ceps (int): Number of cepstra in MFCC computation (including C0) (Default: ``13``)
+ min_duration (float): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
+ num_mel_bins (int): Number of triangular mel-frequency bins (Default: ``23``)
+ preemphasis_coefficient (float): Coefficient for use in signal preemphasis (Default: ``0.97``)
+ raw_energy (bool): If True, compute energy before preemphasis and windowing (Default: ``True``)
+ remove_dc_offset (bool): Subtract mean from waveform on each frame (Default: ``True``)
+ round_to_power_of_two (bool): If True, round window size to power of two by zero-padding input
+ to FFT. (Default: ``True``)
+ sample_frequency (float): Waveform data sample frequency (must match the waveform file, if
+ specified there) (Default: ``16000.0``)
+ snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
+ depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
+ subtract_mean (bool): Subtract mean of each feature file [CMS]; not recommended to do
+ it this way. (Default: ``False``)
+ use_energy (bool): Use energy (not C0) in MFCC computation. (Default: ``False``)
+ vtln_high (float): High inflection point in piecewise linear VTLN warping function (if
+ negative, offset from high-mel-freq) (Default: ``-500.0``)
+ vtln_low (float): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
+ vtln_warp (float): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
+ window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (Default: ``'povey'``)
+
+ Returns:
+ torch.Tensor: An MFCC identical to what Kaldi would output. The shape is (m, ``num_ceps``)
+ where m is calculated in _get_strided
+ """
+ assert num_ceps <= num_mel_bins, 'num_ceps cannot be larger than num_mel_bins: %d vs %d' % (num_ceps, num_mel_bins)
+
+ # The mel_energies should be computed from the power spectrum (use_power=True), not have
+ # the mean subtracted (subtract_mean=False), and use log (use_log_fbank=True).
+ # size (m, num_mel_bins + use_energy)
+ feature = fbank(waveform=waveform, blackman_coeff=blackman_coeff, channel=channel,
+ dither=dither, energy_floor=energy_floor, frame_length=frame_length,
+ frame_shift=frame_shift, high_freq=high_freq, htk_compat=htk_compat,
+ low_freq=low_freq, min_duration=min_duration, num_mel_bins=num_mel_bins,
+ preemphasis_coefficient=preemphasis_coefficient, raw_energy=raw_energy,
+ remove_dc_offset=remove_dc_offset, round_to_power_of_two=round_to_power_of_two,
+ sample_frequency=sample_frequency, snip_edges=snip_edges, subtract_mean=False,
+ use_energy=use_energy, use_log_fbank=True, use_power=True,
+ vtln_high=vtln_high, vtln_low=vtln_low, vtln_warp=vtln_warp, window_type=window_type)
+
+ if use_energy:
+ # size (m)
+ signal_log_energy = feature[:, num_mel_bins if htk_compat else 0]
+ # offset is 0 if htk_compat==True else 1
+ mel_offset = int(not htk_compat)
+ feature = feature[:, mel_offset:(num_mel_bins + mel_offset)]
+
+ # size (num_mel_bins, num_ceps)
+ dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins)
+
+ # size (m, num_ceps)
+ feature = feature.matmul(dct_matrix)
+
+ if cepstral_lifter != 0.0:
+ # size (1, num_ceps)
+ lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0)
+ feature *= lifter_coeffs
+
+ # if use_energy then replace the last column for htk_compat == true else first column
+ if use_energy:
+ feature[:, 0] = signal_log_energy
+
+ if htk_compat:
+ energy = feature[:, 0].unsqueeze(1) # size (m, 1)
+ feature = feature[:, 1:] # size (m, num_ceps - 1)
+ if not use_energy:
+ # scale on C0 (actually removing a scale we previously added that's
+ # part of one common definition of the cosine transform.)
+ energy *= math.sqrt(2)
+
+ feature = torch.cat((feature, energy), dim=1)
+
+ feature = _subtract_column_mean(feature, subtract_mean)
+ return feature
+
+
def _get_LR_indices_and_weights(orig_freq, new_freq, output_samples_in_unit, window_width,
lowpass_cutoff, lowpass_filter_width):
r"""Based on LinearResample::SetIndexesAndWeights where it retrieves the weights for
@@ -817,7 +949,7 @@ Source code for torchaudio.compliance.kaldi
def _lcm(a, b):
- return abs(a * b) // math.gcd(a, b)
+ return abs(a * b) // fractions.gcd(a, b)
def _get_num_LR_output_samples(input_num_samp, samp_rate_in, samp_rate_out):
@@ -892,7 +1024,7 @@ Source code for torchaudio.compliance.kaldi
assert lowpass_cutoff * 2 <= min_freq
- base_freq = math.gcd(int(orig_freq), int(new_freq))
+ base_freq = fractions.gcd(int(orig_freq), int(new_freq))
input_samples_in_unit = int(orig_freq) // base_freq
output_samples_in_unit = int(new_freq) // base_freq
diff --git a/_modules/torchaudio/datasets/vctk.html b/_modules/torchaudio/datasets/vctk.html
index 6a05437da8..147c4bb4cf 100644
--- a/_modules/torchaudio/datasets/vctk.html
+++ b/_modules/torchaudio/datasets/vctk.html
@@ -215,7 +215,7 @@
Source code for torchaudio.datasets.vctk
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
import torch.utils.data as data
import os
import os.path
diff --git a/_modules/torchaudio/datasets/yesno.html b/_modules/torchaudio/datasets/yesno.html
index 895158770a..e773982335 100644
--- a/_modules/torchaudio/datasets/yesno.html
+++ b/_modules/torchaudio/datasets/yesno.html
@@ -215,7 +215,7 @@
Source code for torchaudio.datasets.yesno
-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
import torch.utils.data as data
import os
import os.path
diff --git a/_modules/torchaudio/functional.html b/_modules/torchaudio/functional.html
index 6144c36350..92edd060a9 100644
--- a/_modules/torchaudio/functional.html
+++ b/_modules/torchaudio/functional.html
@@ -215,7 +215,8 @@
Source code for torchaudio.functional
-import math
+from __future__ import absolute_import, division, print_function, unicode_literals
+import math
import torch
@@ -278,8 +279,8 @@ Source code for torchaudio.functional
could be useful. If ``length`` is ``None`` then padding will be aggressively removed
(some loss of signal).
- [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,”
- IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
+ [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform,"
+ IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984.
Args:
stft_matrix (torch.Tensor): Output of stft where each row of a channel is a frequency and each
@@ -312,6 +313,7 @@ Source code for torchaudio.functional
# add a channel dimension
stft_matrix = stft_matrix.unsqueeze(0)
+ dtype = stft_matrix.dtype
device = stft_matrix.device
fft_size = stft_matrix.size(1)
assert (onesided and n_fft // 2 + 1 == fft_size) or (not onesided and n_fft == fft_size), (
@@ -330,7 +332,7 @@ Source code for torchaudio.functional
assert 0 < win_length <= n_fft
if window is None:
- window = torch.ones(win_length)
+ window = torch.ones(win_length, requires_grad=False, device=device, dtype=dtype)
assert window.dim() == 1 and window.size(0) == win_length
@@ -353,7 +355,7 @@ Source code for torchaudio.functional
ytmp = ytmp.transpose(1, 2) # size (channel, n_fft, n_frames)
eye = torch.eye(n_fft, requires_grad=False,
- device=device).unsqueeze(1) # size (n_fft, 1, n_fft)
+ device=device, dtype=dtype).unsqueeze(1) # size (n_fft, 1, n_fft)
# this does overlap add where the frames of ytmp are added such that the i'th frame of
# ytmp is added starting at i*hop_length in the output
diff --git a/_modules/torchaudio/kaldi_io.html b/_modules/torchaudio/kaldi_io.html
index 625c2f645b..1b0368f1dd 100644
--- a/_modules/torchaudio/kaldi_io.html
+++ b/_modules/torchaudio/kaldi_io.html
@@ -215,7 +215,8 @@
Source code for torchaudio.kaldi_io
-# To use this file, the dependency (https://github.com/vesis84/kaldi-io-for-python)
+from __future__ import absolute_import, division, print_function, unicode_literals
+# To use this file, the dependency (https://github.com/vesis84/kaldi-io-for-python)
# needs to be installed. This is a light wrapper around kaldi_io that returns
# torch.Tensors.
import torch
diff --git a/_modules/torchaudio/sox_effects.html b/_modules/torchaudio/sox_effects.html
index 88ebdfc210..fd5ce7d101 100644
--- a/_modules/torchaudio/sox_effects.html
+++ b/_modules/torchaudio/sox_effects.html
@@ -215,7 +215,7 @@
Source code for torchaudio.sox_effects
-from __future__ import division, print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
import torch
import _torch_sox
diff --git a/_modules/torchaudio/transforms.html b/_modules/torchaudio/transforms.html
index 28754fddbb..4a4817e32d 100644
--- a/_modules/torchaudio/transforms.html
+++ b/_modules/torchaudio/transforms.html
@@ -215,7 +215,7 @@
Source code for torchaudio.transforms
-from __future__ import division, print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
from warnings import warn
import math
import torch
diff --git a/_sources/compliance.kaldi.rst.txt b/_sources/compliance.kaldi.rst.txt
index 1dfee29eb1..cc75021d69 100644
--- a/_sources/compliance.kaldi.rst.txt
+++ b/_sources/compliance.kaldi.rst.txt
@@ -15,15 +15,20 @@ produce similar outputs.
Functions
---------
+:hidden:`spectrogram`
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: spectrogram
+
:hidden:`fbank`
~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: fbank
-:hidden:`spectrogram`
+:hidden:`mfcc`
~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autofunction:: spectrogram
+.. autofunction:: mfcc
:hidden:`resample_waveform`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/compliance.kaldi.html b/compliance.kaldi.html
index 54af640bbc..634ab9693e 100644
--- a/compliance.kaldi.html
+++ b/compliance.kaldi.html
@@ -223,6 +223,53 @@ torchaudio.compliance.kaldi
Functions¶
+
+spectrogram¶
+
+-
+
torchaudio.compliance.kaldi.spectrogram(waveform, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, min_duration=0.0, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, subtract_mean=False, window_type='povey')[source]¶
+Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi’s
+compute-spectrogram-feats.
+
+- Parameters
+
+waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)
+blackman_coeff (float) – Constant coefficient for generalized Blackman window. (Default: 0.42)
+channel (int) – Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: -1)
+dither (float) – Dithering constant (0.0 means no dither). If you turn this off, you should set
+the energy_floor option, e.g. to 1.0 or 0.1 (Default: 1.0)
+energy_floor (float) – Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
+this floor is applied to the zeroth component, representing the total signal energy. The floor on the
+individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: 0.0)
+frame_length (float) – Frame length in milliseconds (Default: 25.0)
+frame_shift (float) – Frame shift in milliseconds (Default: 10.0)
+min_duration (float) – Minimum duration of segments to process (in seconds). (Default: 0.0)
+preemphasis_coefficient (float) – Coefficient for use in signal preemphasis (Default: 0.97)
+raw_energy (bool) – If True, compute energy before preemphasis and windowing (Default: True)
+remove_dc_offset (bool) – Subtract mean from waveform on each frame (Default: True)
+round_to_power_of_two (bool) – If True, round window size to power of two by zero-padding input
+to FFT. (Default: True)
+sample_frequency (float) – Waveform data sample frequency (must match the waveform file, if
+specified there) (Default: 16000.0)
+snip_edges (bool) – If True, end effects will be handled by outputting only frames that completely fit
+in the file, and the number of frames depends on the frame_length. If False, the number of frames
+depends only on the frame_shift, and we reflect the data at the ends. (Default: True)
+subtract_mean (bool) – Subtract mean of each feature file [CMS]; not recommended to do
+it this way. (Default: False)
+window_type (str) – Type of window (‘hamming’|’hanning’|’povey’|’rectangular’|’blackman’) (Default: 'povey')
+
+
+- Returns
+A spectrogram identical to what Kaldi would output. The shape is
+(m, padded_window_size // 2 + 1) where m is calculated in _get_strided
+
+- Return type
+-
+
+
+
+
+
fbank¶
@@ -233,7 +280,7 @@ fbank
- Parameters
-waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)
+waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float) – Constant coefficient for generalized Blackman window. (Default: 0.42)
channel (int) – Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: -1)
dither (float) – Dithering constant (0.0 means no dither). If you turn this off, you should set
@@ -276,24 +323,25 @@
fbank
- Return type
--
+
-
-
-spectrogram¶
+
+mfcc¶
--
-
torchaudio.compliance.kaldi.spectrogram(waveform, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, min_duration=0.0, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, subtract_mean=False, window_type='povey')[source]¶
-Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi’s
-compute-spectrogram-feats.
+-
+
torchaudio.compliance.kaldi.mfcc(waveform, blackman_coeff=0.42, cepstral_lifter=22.0, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, high_freq=0.0, htk_compat=False, low_freq=20.0, num_ceps=13, min_duration=0.0, num_mel_bins=23, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, subtract_mean=False, use_energy=False, vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type='povey')[source]¶
+Create an MFCC from a raw audio signal. This matches the input/output of Kaldi’s
+compute-mfcc-feats.
- Parameters
-waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)
+waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)
blackman_coeff (float) – Constant coefficient for generalized Blackman window. (Default: 0.42)
+cepstral_lifter (float) – Constant that controls scaling of MFCCs (Default: 22.0)
channel (int) – Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: -1)
dither (float) – Dithering constant (0.0 means no dither). If you turn this off, you should set
the energy_floor option, e.g. to 1.0 or 0.1 (Default: 1.0)
@@ -302,7 +350,13 @@ spectrogram0.0)
frame_length (float) – Frame length in milliseconds (Default: 25.0)
frame_shift (float) – Frame shift in milliseconds (Default: 10.0)
+high_freq (float) – High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (Default: 0.0)
+htk_compat (bool) – If true, put energy last. Warning: not sufficient to get HTK compatible features (need
+to change other parameters). (Default: False)
+low_freq (float) – Low cutoff frequency for mel bins (Default: 20.0)
+num_ceps (int) – Number of cepstra in MFCC computation (including C0) (Default: 13)
min_duration (float) – Minimum duration of segments to process (in seconds). (Default: 0.0)
+num_mel_bins (int) – Number of triangular mel-frequency bins (Default: 23)
preemphasis_coefficient (float) – Coefficient for use in signal preemphasis (Default: 0.97)
raw_energy (bool) – If True, compute energy before preemphasis and windowing (Default: True)
remove_dc_offset – Subtract mean from waveform on each frame (Default: True)
@@ -315,15 +369,20 @@ spectrogramTrue)
subtract_mean (bool) – Subtract mean of each feature file [CMS]; not recommended to do
it this way. (Default: False)
+use_energy (bool) – Use energy (not C0) in MFCC computation. (Default: False)
+vtln_high (float) – High inflection point in piecewise linear VTLN warping function (if
+negative, offset from high-mel-freq) (Default: -500.0)
+vtln_low (float) – Low inflection point in piecewise linear VTLN warping function (Default: 100.0)
+vtln_warp (float) – Vtln warp factor (only applicable if vtln_map not specified) (Default: 1.0)
window_type (str) – Type of window (‘hamming’|’hanning’|’povey’|’rectangular’|’blackman’) (Default: 'povey')
- Returns
-A spectrogram identical to what Kaldi would output. The shape is
-(m, padded_window_size // 2 + 1) where m is calculated in _get_strided
+An MFCC identical to what Kaldi would output. The shape is (m, num_ceps)
+where m is calculated in _get_strided
- Return type
--
+
-
@@ -344,7 +403,7 @@ resample_waveform
Parameters
-waveform (torch.Tensor) – The input signal of size (c, n)
+waveform (torch.Tensor) – The input signal of size (c, n)
orig_freq (float) – The original frequency of the signal
new_freq (float) – The desired frequency
lowpass_filter_width (int) – Controls the sharpness of the filter, more == sharper
@@ -355,7 +414,7 @@
resample_waveformThe waveform at the new frequency
Return type
-
+
@@ -409,8 +468,9 @@ resample_waveform
torchaudio.compliance.kaldi
- Functions
diff --git a/datasets.html b/datasets.html
index 1f01574a17..80b61c03c9 100644
--- a/datasets.html
+++ b/datasets.html
@@ -218,9 +218,9 @@
torchaudio.datasets¶
-All datasets are subclasses of torch.utils.data.Dataset
+
All datasets are subclasses of torch.utils.data.Dataset
i.e., they have __getitem__ and __len__ methods implemented.
-Hence, they can all be passed to a torch.utils.data.DataLoader
+Hence, they can all be passed to a torch.utils.data.DataLoader
which can load multiple samples in parallel using torch.multiprocessing workers.
For example:
yesno_data = torchaudio.datasets.YESNO('.', download=True)
@@ -277,7 +277,7 @@ VCTKReturn type
-Tuple[torch.Tensor, int]
+Tuple[torch.Tensor, int]
@@ -321,7 +321,7 @@ YESNOReturn type
-Tuple[torch.Tensor, int]
+Tuple[torch.Tensor, int]
diff --git a/functional.html b/functional.html
index b35433f87f..a140ea1180 100644
--- a/functional.html
+++ b/functional.html
@@ -243,18 +243,18 @@ istftlength is None then padding will be aggressively removed
(some loss of signal).
[1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,”
-IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
+IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984.
- Parameters
-stft_matrix (torch.Tensor) – Output of stft where each row of a channel is a frequency and each
+
stft_matrix (torch.Tensor) – Output of stft where each row of a channel is a frequency and each
column is a window. it has a size of either (channel, fft_size, n_frames, 2) or (
fft_size, n_frames, 2)
n_fft (int) – Size of Fourier transform
hop_length (Optional[int]) – The distance between neighboring sliding window frames.
(Default: win_length // 4)
win_length (Optional[int]) – The size of window frame and STFT filter. (Default: n_fft)
-window (Optional[torch.Tensor]) – The optional window function.
+
window (Optional[torch.Tensor]) – The optional window function.
(Default: torch.ones(win_length))
center (bool) – Whether input was padded on both sides so
that the \(t\)-th frame is centered at time \(t \times \text{hop\_length}\).
@@ -272,7 +272,7 @@
istft
- Return type
--
+
-
@@ -287,9 +287,9 @@ spectrogram
- Parameters
-waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
+waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
pad (int) – Two sided padding of signal
-window (torch.Tensor) – Window tensor that is applied/multiplied to each frame/window
+window (torch.Tensor) – Window tensor that is applied/multiplied to each frame/window
n_fft (int) – Size of FFT
hop_length (int) – Length of hop between STFT windows
win_length (int) – Window size
@@ -304,7 +304,7 @@ spectrogramReturn type
--
+
-
@@ -322,7 +322,7 @@ amplitude_to_DB
- Parameters
-x (torch.Tensor) – Input tensor before being converted to decibel scale
+x (torch.Tensor) – Input tensor before being converted to decibel scale
multiplier (float) – Use 10. for power and 20. for amplitude
amin (float) – Number to clamp x
db_multiplier (float) – Log10(max(reference value and amin))
@@ -334,7 +334,7 @@ amplitude_to_DBOutput tensor in decibel scale
- Return type
--
+
-
@@ -363,7 +363,7 @@ create_fb_matrixA * create_fb_matrix(A.size(-1), ...).
- Return type
--
+
-
@@ -389,7 +389,7 @@ create_dctn_mels, n_mfcc).
- Return type
--
+
-
@@ -407,7 +407,7 @@ mu_law_encoding
- Parameters
-x (torch.Tensor) – Input tensor
+x (torch.Tensor) – Input tensor
quantization_channels (int) – Number of channels
@@ -415,7 +415,7 @@ mu_law_encodingInput after mu-law encoding
- Return type
--
+
-
@@ -433,7 +433,7 @@ mu_law_decoding
- Parameters
-x_mu (torch.Tensor) – Input tensor
+x_mu (torch.Tensor) – Input tensor
quantization_channels (int) – Number of channels
@@ -441,7 +441,7 @@ mu_law_decodingInput after mu-law decoding
- Return type
--
+
-
@@ -456,7 +456,7 @@ complex_norm
- Parameters
-complex_tensor (torch.Tensor) – Tensor shape of (*, complex=2)
+complex_tensor (torch.Tensor) – Tensor shape of (*, complex=2)
power (float) – Power of the norm. (Default: 1.0).
@@ -464,7 +464,7 @@ complex_normPower of the normed input tensor. Shape of (*, )
- Return type
--
+
-
@@ -478,13 +478,13 @@ angleCompute the angle of complex tensor input.
- Parameters
-complex_tensor (torch.Tensor) – Tensor shape of (*, complex=2)
+complex_tensor (torch.Tensor) – Tensor shape of (*, complex=2)
- Returns
Angle of a complex tensor. Shape of (*, )
- Return type
--
+
-
@@ -499,7 +499,7 @@ magphase
- Parameters
-complex_tensor (torch.Tensor) – Tensor shape of (*, complex=2)
+complex_tensor (torch.Tensor) – Tensor shape of (*, complex=2)
power (float) – Power of the norm. (Default: 1.0)
@@ -507,7 +507,7 @@ magphaseThe magnitude and phase of the complex tensor
- Return type
-Tuple[torch.Tensor, torch.Tensor]
+Tuple[torch.Tensor, torch.Tensor]
@@ -523,9 +523,9 @@ phase_vocoder
- Parameters
-complex_specgrams (torch.Tensor) – Dimension of (*, channel, freq, time, complex=2)
+complex_specgrams (torch.Tensor) – Dimension of (*, channel, freq, time, complex=2)
rate (float) – Speed-up factor
-phase_advance (torch.Tensor) – Expected phase advance in each bin. Dimension
+
phase_advance (torch.Tensor) – Expected phase advance in each bin. Dimension
of (freq, 1)
@@ -534,7 +534,7 @@ phase_vocoder
- Return type
-complex_specgrams_stretch (torch.Tensor)
+complex_specgrams_stretch (torch.Tensor)
diff --git a/genindex.html b/genindex.html
index 9466ca5383..89a6283029 100644
--- a/genindex.html
+++ b/genindex.html
@@ -356,6 +356,8 @@ M
+ - mfcc() (in module torchaudio.compliance.kaldi)
+
- mu_law_decoding() (in module torchaudio.functional)
- mu_law_encoding() (in module torchaudio.functional)
diff --git a/index.html b/index.html
index fc535b8f52..39731b8a8e 100644
--- a/index.html
+++ b/index.html
@@ -364,7 +364,7 @@
torchaudioParameters
filepath (str or pathlib.Path) – Path to audio file
-out (torch.Tensor, optional) – An output tensor to use instead of creating one. (Default: None)
+out (torch.Tensor, optional) – An output tensor to use instead of creating one. (Default: None)
normalization (bool, number, or callable, optional) – If boolean True, then output is divided by 1 << 31
(assumes signed 32-bit audio), and normalizes to [0, 1].
If number, then output is divided by that number
@@ -390,7 +390,7 @@
torchaudioReturn type
-Tuple[torch.Tensor, int]
+Tuple[torch.Tensor, int]
@@ -436,7 +436,7 @@ torchaudioParameters
filepath (str) – Path to audio file
-src (torch.Tensor) – An input 2D tensor of shape [C x L] or [L x C] where L is
+
src (torch.Tensor) – An input 2D tensor of shape [C x L] or [L x C] where L is
the number of audio frames, C is the number of channels
sample_rate (int) – An integer which is the sample rate of the
audio (as listed in the metadata of the file)
@@ -456,7 +456,7 @@ torchaudioParameters
filepath (str) – Path to audio file
-src (torch.Tensor) – An input 2D tensor of shape [C x L] or [L x C] where L is
+
src (torch.Tensor) – An input 2D tensor of shape [C x L] or [L x C] where L is
the number of audio frames, C is the number of channels
channels_first (bool) – Set channels first or length first in result. (Default: True)
signalinfo (sox_signalinfo_t) – A sox_signalinfo_t type, which could be helpful if the
diff --git a/kaldi_io.html b/kaldi_io.html
index c47e00e862..aa8b64ab65 100644
--- a/kaldi_io.html
+++ b/kaldi_io.html
@@ -219,7 +219,7 @@
torchaudio.kaldi_io¶
To use this module, the dependency kaldi_io needs to be installed.
-This is a light wrapper around kaldi_io that returns torch.Tensor.
+This is a light wrapper around kaldi_io that returns torch.Tensor.
Vectors¶
@@ -236,7 +236,7 @@ read_vec_int_arkThe string is the key and the tensor is the vector read from file
- Return type
-Generator[str, torch.Tensor]
+Generator[str, torch.Tensor]
@@ -263,7 +263,7 @@ read_vec_flt_scpThe string is the key and the tensor is the vector read from file
- Return type
-Generator[str, torch.Tensor]
+Generator[str, torch.Tensor]
@@ -290,7 +290,7 @@ read_vec_flt_arkThe string is the key and the tensor is the vector read from file
- Return type
-Generator[str, torch.Tensor]
+Generator[str, torch.Tensor]
@@ -320,7 +320,7 @@ read_mat_scpThe string is the key and the tensor is the matrix read from file
- Return type
-Generator[str, torch.Tensor]
+Generator[str, torch.Tensor]
@@ -347,7 +347,7 @@ read_mat_arkThe string is the key and the tensor is the matrix read from file
- Return type
-Generator[str, torch.Tensor]
+Generator[str, torch.Tensor]
diff --git a/objects.inv b/objects.inv
index 72131abc16..cd2e8b74b7 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index 6b9aa981fc..063bad04fc 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["compliance.kaldi","datasets","functional","index","kaldi_io","sox_effects","transforms"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["compliance.kaldi.rst","datasets.rst","functional.rst","index.rst","kaldi_io.rst","sox_effects.rst","transforms.rst"],objects:{"":{torchaudio:[3,0,0,"-"]},"torchaudio._docs.AmplitudeToDB":{forward:[6,1,1,""]},"torchaudio._docs.MFCC":{forward:[6,1,1,""]},"torchaudio._docs.MelScale":{forward:[6,1,1,""]},"torchaudio._docs.MelSpectrogram":{forward:[6,1,1,""]},"torchaudio._docs.MuLawDecoding":{forward:[6,1,1,""]},"torchaudio._docs.MuLawEncoding":{forward:[6,1,1,""]},"torchaudio._docs.Resample":{forward:[6,1,1,""]},"torchaudio._docs.Spectrogram":{forward:[6,1,1,""]},"torchaudio.compliance.kaldi":{fbank:[0,2,1,""],resample_waveform:[0,2,1,""],spectrogram:[0,2,1,""]},"torchaudio.datasets":{VCTK:[1,3,1,""],YESNO:[1,3,1,""]},"torchaudio.datasets.VCTK":{__getitem__:[1,1,1,""]},"torchaudio.datasets.YESNO":{__getitem__:[1,1,1,""]},"torchaudio.functional":{amplitude_to_DB:[2,2,1,""],angle:[2,2,1,""],complex_norm:[2,2,1,""],create_dct:[2,2,1,""],create_fb_matrix:[2,2,1,""],istft:[2,2,1,""],magphase:[2,2,1,""],mu_law_decoding:[2,2,1,""],mu_law_encoding:[2,2,1,""],phase_vocoder:[2,2,1,""],spectrogram:[2,2,1,""]},"torchaudio.kaldi_io":{read_mat_ark:[4,2,1,""],read_mat_scp:[4,2,1,""],read_vec_flt_ark:[4,2,1,""],read_vec_flt_scp:[4,2,1,""],read_vec_int_ark:[4,2,1,""]},"torchaudio.sox_effects":{SoxEffect:[5,3,1,""],SoxEffectsChain:[5,3,1,""]},"torchaudio.sox_effects.SoxEffectsChain":{append_effect_to_chain:[5,1,1,""],clear_chain:[5,1,1,""],set_input_file:[5,1,1,""],sox_build_flow_effects:[5,1,1,""]},"torchaudio.transforms":{AmplitudeToDB:[6,3,1,
""],MFCC:[6,3,1,""],MelScale:[6,3,1,""],MelSpectrogram:[6,3,1,""],MuLawDecoding:[6,3,1,""],MuLawEncoding:[6,3,1,""],Resample:[6,3,1,""],Spectrogram:[6,3,1,""]},torchaudio:{get_sox_bool:[3,2,1,""],get_sox_encoding_t:[3,2,1,""],get_sox_option_t:[3,2,1,""],info:[3,2,1,""],initialize_sox:[3,2,1,""],load:[3,2,1,""],load_wav:[3,2,1,""],save:[3,2,1,""],save_encinfo:[3,2,1,""],shutdown_sox:[3,2,1,""],sox_encodinginfo_t:[3,2,1,""],sox_signalinfo_t:[3,2,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","method","Python method"],"2":["py","function","Python function"],"3":["py","class","Python class"]},objtypes:{"0":"py:module","1":"py:method","2":"py:function","3":"py:class"},terms:{"16000hz":5,"179d6e9a88202ab0a2f":6,"boolean":[3,5],"byte":3,"class":[1,5,6],"default":[0,1,2,3,5,6],"enum":3,"final":3,"float":[0,2,3,6],"function":[1,3,5,6],"import":2,"int":[0,1,2,3,4,5,6],"new":0,"return":[0,1,2,3,4,5,6],"short":2,"true":[0,1,2,3,5,6],CMS:0,For:[1,2,3,6],Not:3,The:[0,1,2,3,4,5,6],These:2,Use:2,Useful:1,__getitem__:[1,5],__init__:5,__len__:[1,5],__members__:3,_get_strid:0,_length:2,_modul:6,abs:3,absolut:0,accord:4,add:0,addit:2,advanc:2,after:[2,3,6],again:1,aggress:2,algorithm:[2,6],all:[1,2,3],almost:1,alreadi:1,altern:1,amin:2,amount:2,amplitud:[2,6],amplitude_to_db:3,amplitudetodb:3,angl:3,anoth:6,api:1,append:5,append_effect_to_chain:5,appli:[0,2,6],applic:0,apr:2,arg:1,argument:[1,6],ark:4,around:[0,4],asr:0,assp:2,assum:[2,3,5,6],attempt:3,attribut:5,audio:[0,1,2,3,5,6],audiodir_path:5,automat:[3,5],avail:1,bandlimit:0,bank:[2,6],base:[2,6],batch:2,batch_siz:1,becaus:2,been:[2,6],befor:[0,2,6],begin:3,being:[2,6],between:[2,5,6],bin:[0,2,6],bit:[3,5],bits_per_sampl:3,blackman:0,blackman_coeff:0,blob:0,bool:[0,1,2,3,5,6],both:2,build:5,built:6,calcul:[0,2,6],call:3,callabl:[1,3,5,6],can:[0,1,2,3,6],cancel:2,cannot:[2,3,5],caution:0,ccrma:0,ceil:2,center:2,cepstrum:6,certain:2,chain:[3,5,6],chang:[0,3],channel:[0,2,3,5,6],channels_first:[3,5],check:2,choos:3
,clamp:2,clean:1,clear:5,clear_chain:5,clip:[2,6],coeffici:[0,2,6],column:2,com:[0,6],common:[1,2,3,6],compand:[2,6],compat:0,complet:0,complex:2,complex_norm:3,complex_specgram:2,complex_specgrams_stretch:2,complex_tensor:2,complianc:3,compon:0,composit:6,compress:3,comput:[0,2],condit:2,consider:2,consist:[3,6],constant:[0,2],control:[0,2,6],conveni:3,convers:[2,6],convert:[2,6],core:6,cosin:6,could:[2,3,5],creat:[0,2,3,4,5,6],create_dct:3,create_fb_matrix:3,cut:[2,6],cutoff:0,data:[0,1,2,3,5],data_load:1,data_vol_norm:3,dataload:1,dataset:[3,5],db_multipli:2,dct:[2,6],dct_type:6,decibel:[2,6],decod:[2,6],def:5,depend:[0,2,4,6],descriptor:4,desir:[0,6],determin:[3,5],dev_mod:1,devic:6,dict:[3,6],dictionari:4,differ:[2,6],dimens:[0,2,6],directori:1,discard:2,discret:6,disk:3,distanc:2,dither:0,divid:[3,5],down:2,download:1,downsampl:[0,1],durat:0,each:[0,2,3,6],earg:5,edu:0,effect:[0,3,5],effici:0,either:2,element:[0,2],elementwis:6,els:0,enam:5,encod:[2,3,6],encodinginfo:3,end:[0,2],endian:3,energi:[0,2,6],energy_floor:0,entri:[2,6],envelop:2,eopt:5,epsilon:0,estim:2,etc:[2,3,6],everyth:3,exactli:2,exampl:[1,2,3,4,5,6],exist:1,expect:[0,2,6],expon:[2,6],extens:[3,5],extra:0,extract:0,f_max:[2,6],f_min:[2,6],factor:[0,2,3],fals:[0,1,2,6],fault:3,feat:0,featur:0,fft:[0,2,6],fft_size:2,file:[0,1,3,4,5],file_or_fd:4,filedescriptor:4,filepath:3,filetyp:[3,5],filter:[0,2,6],filterbank:[0,2,6],finish:3,first:[3,5,6],fit:[0,2],fix:0,float32:4,float64:4,floor:0,flow:5,follow:[1,5],foo:3,format:3,forward:6,fourier:[2,6],frame:[0,2,3,5,6],frame_length:0,frame_shift:0,freq:[0,2,6],frequenc:[0,2,6],from:[0,1,2,3,4,5,6],full:[2,6],gener:[0,4],get:[0,3],get_sox_bool:3,get_sox_encoding_t:3,get_sox_option_t:3,gist:6,github:[0,6],give:6,given:[0,2,3,5,6],griffin:2,gzip:4,ham:0,han:0,handl:0,hann_window:6,has:[0,2,6],have:1,haythamfayek:6,headroom:3,hebrew:1,help:[3,5],henc:1,here:[5,6],high:0,high_freq:0,highlight:2,hop:[2,6],hop_length:[2,6],htk:0,htk_compat:0,html:[0,6],http:[0,6
],ident:0,ieee:2,imag:1,implement:[1,6],importantli:3,index:[1,5],individu:0,inflect:0,info:[2,3,6],inform:[2,5],infti:2,initi:3,initialize_sox:[3,5],input:[0,1,2,3,5,6],input_fil:5,instal:4,instead:[3,6],integ:[3,5],internet:1,interpol:0,interv:0,invers:[2,6],isn:2,istft:3,its:2,join:5,jos:0,kaldi:[3,4],kaldi_io:3,kastnerkyl:6,keep:1,kei:4,kwarg:3,l56:0,lambda:3,last:[0,2],law:[2,6],learn:6,least:2,left:[0,2],len:5,length:[0,2,3,5,6],less:0,librosa:6,light:4,like:3,lim:2,linear:0,linearli:0,linearresampl:0,linspac:2,list:[3,5],listdir:5,load:[1,3,6],load_wav:3,log10:2,log:[0,6],log_mel:6,loss:2,lossi:3,low:0,low_freq:0,lowpass_filter_width:0,machin:6,magnitud:[0,2,6],magphas:3,mai:[2,6],master:0,match:0,math:2,matric:3,matrix:[2,4,6],max:[2,3],maximum:[2,6],mean:[0,2],mel:[0,2,6],mel_specgram:6,melkwarg:6,melscal:3,melspectrogram:3,metadata:[3,5],method:[1,2,6],mfc:[2,6],mfcc:3,millisecond:0,min_dur:0,minimum:[0,2,6],modifi:2,modul:4,mono:[0,5],more:[0,2,6],mp3:3,mu_law_decod:3,mu_law_encod:3,mulawdecod:3,mulawencod:3,mult:3,multipl:1,multipli:[2,3,6],multiprocess:1,must:[0,2,6],mydataset:5,n_fft:[2,6],n_frame:[2,6],n_freq:2,n_mel:[2,6],n_mfcc:[2,6],n_stft:6,name:5,need:[0,3,4,6],neg:[0,2,6],neighbor:2,never:2,new_freq:[0,6],nibbl:3,nola:2,none:[1,2,3,5,6],nonzero:2,norm:[2,6],normal:[0,2,3,5,6],note:3,nthread:1,num_fram:3,num_freq:2,num_mel_bin:0,num_work:1,number:[0,2,3,5,6],numer:6,numeric_limit:0,nyquist:0,object:[3,5,6],occur:2,off:[0,2,6],offlinefeaturetpl:0,offset:[0,3],onc:3,one:[3,6],ones:2,onesid:2,onli:[0,3],open:4,oper:[0,2],opposite_endian:3,option:[0,1,2,3,5,6],orig_freq:[0,6],origin:[0,2,6],ortho:[2,6],other:0,out:[3,5],out_encinfo:5,out_siginfo:5,output:[0,1,2,3,5,6],overlap:2,packag:3,pad:[0,2,6],pad_mod:2,padded_window_s:0,parallelli:1,paramet:[0,1,2,3,4,5,6],pass:[1,3,5],path:[3,5],path_to_audio_fil:5,pathlib:3,per:3,perform:[0,1,2],phase:2,phase_adv:2,phase_vocod:3,piecewis:0,pil:1,pipe:4,pitch:2,point:[0,2],popular:3,possibl:3,povei:0,power:[0,
2,6],practic:3,precis:3,preemphasi:0,preemphasis_coeffici:0,preprocess:5,prevent:2,primarili:3,print:3,process:[0,1,6],produc:0,provid:2,put:[0,1],python:[3,5,6],quantization_channel:[2,6],randn:2,rang:0,rate:[2,3,5,6],rather:3,raw:[0,1,2,5,6],raw_energi:0,read:4,reason:[2,6],recommend:0,rectangular:0,refer:[2,3],reflect:[0,2],rel:0,remov:2,remove_dc_offset:0,repres:0,requir:3,resampl:[0,3,5],resamplewaveform:0,resampling_method:6,respect:1,result:[2,3,5],retain:[2,6],revers:3,reverse_bit:3,reverse_byt:3,reverse_nibbl:3,right:[0,2,3],root:1,round:0,round_to_power_of_two:0,row:2,run:3,same:[2,3],sampl:[0,1,3,5,6],sample_frequ:0,sample_r:[3,6],save:3,save_encinfo:3,scale:[2,6],scp:4,second:0,see:[2,3,6],seg:3,segment:0,self:5,separ:2,sequenti:6,set:[0,3,5],set_input_fil:5,shape:[0,2,3],sharp:0,sharper:0,shift:[0,3],shorter:2,should:[0,2],showdown:3,shuffl:1,shutdown:3,shutdown_sox:[3,5],side:[2,6],sig:5,sign:[3,5],signal:[0,1,2,3,5,6],signal_length:2,signalinfo:3,similar:[0,1],simpl:3,sinc:[0,2],sinc_interpol:6,size:[0,2,3,5,6],slide:2,slow:2,snip_edg:0,snippet:[2,6],some:2,someth:5,sourc:[0,1,2,3,4,5,6],sox:[3,5],sox_bool:3,sox_build_flow_effect:5,sox_effect:3,sox_encoding_t:3,sox_encodinginfo_t:[3,5],sox_fals:3,sox_option_default:3,sox_option_t:3,sox_signalinfo_t:[3,5],soxeffect:3,soxeffectschain:3,space:0,spec_f:6,specgram:6,specgram_mel_db:6,specif:[2,3],specifi:[0,3],spectrogram:[1,3],spectrum:6,speech:6,speed:2,split:[2,6],squar:[2,6],src:[0,3],stabl:6,standard:3,stanford:0,start:3,std:0,stft:[2,6],stft_matrix:2,str:[0,1,2,3,4,5,6],stream:4,string:4,stype:6,subclass:1,subtract:0,subtract_mean:0,suffici:0,suggest:0,sum_:2,summat:2,suppos:2,take:1,target:1,target_transform:1,tensor:[0,1,2,3,4,5,6],test:[1,6],text:2,textbook:6,than:2,thei:[1,2,6],theory_ideal_bandlimited_interpol:0,thi:[0,2,3,4,6],time:[2,6],timsainb:6,togeth:6,top_db:[2,6],torch:[0,1,2,3,4,5,6],total:0,train:1,tran:2,transcript:1,transform:[1,2,3],triangular:[0,2,6],trim:2,tupl:[1,2,3,4,5],turn:[0
,2,6],two:[0,1,2,6],type:[0,1,2,3,4,5,6],ulaw:3,unchang:[2,6],uniqu:3,unknown:3,unspecifi:3,upsampl:0,url:1,use:[0,2,3,4,6],use_energi:0,use_log_fbank:0,use_pow:0,used:[2,3],useful:[0,2],user:6,uses:[0,3,6],using:[1,2,6],util:1,valu:[2,3,6],variou:0,vctk:3,vector:3,version:1,vol:2,vtln:0,vtln_high:0,vtln_low:0,vtln_map:0,vtln_warp:0,wai:0,warn:0,warp:0,wav:[3,6],wave:3,waveform:[0,2,6],what:0,when:[2,3],where:[0,1,2,3,5,6],whether:[1,2,6],which:[0,1,2,3,4,5,6],whole:2,wikipedia:[2,6],win_length:[2,6],window:[0,2,6],window_fn:6,window_typ:0,wise:2,without:[2,3],wkwarg:6,worker:1,would:[0,2],wrapper:4,written:5,x_mu:[2,6],yesno:3,yesno_data:1,you:[0,3],zero:[0,2],zeroth:0},titles:["torchaudio.compliance.kaldi","torchaudio.datasets","torchaudio.functional","torchaudio","torchaudio.kaldi_io","torchaudio.sox_effects","torchaudio.transforms"],titleterms:{"function":[0,2],amplitude_to_db:2,amplitudetodb:6,angl:2,complex_norm:2,complianc:0,create_dct:2,create_fb_matrix:2,dataset:1,fbank:0,istft:2,kaldi:0,kaldi_io:4,magphas:2,matric:4,melscal:6,melspectrogram:6,mfcc:6,mu_law_decod:2,mu_law_encod:2,mulawdecod:6,mulawencod:6,phase_vocod:2,read_mat_ark:4,read_mat_scp:4,read_vec_flt_ark:4,read_vec_flt_scp:4,read_vec_int_ark:4,resampl:6,resample_waveform:0,sox_effect:5,soxeffect:5,soxeffectschain:5,spectrogram:[0,2,6],torchaudio:[0,1,2,3,4,5,6],transform:6,vctk:1,vector:4,yesno:1}})
\ No newline at end of file
+Search.setIndex({docnames:["compliance.kaldi","datasets","functional","index","kaldi_io","sox_effects","transforms"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["compliance.kaldi.rst","datasets.rst","functional.rst","index.rst","kaldi_io.rst","sox_effects.rst","transforms.rst"],objects:{"":{torchaudio:[3,0,0,"-"]},"torchaudio._docs.AmplitudeToDB":{forward:[6,1,1,""]},"torchaudio._docs.MFCC":{forward:[6,1,1,""]},"torchaudio._docs.MelScale":{forward:[6,1,1,""]},"torchaudio._docs.MelSpectrogram":{forward:[6,1,1,""]},"torchaudio._docs.MuLawDecoding":{forward:[6,1,1,""]},"torchaudio._docs.MuLawEncoding":{forward:[6,1,1,""]},"torchaudio._docs.Resample":{forward:[6,1,1,""]},"torchaudio._docs.Spectrogram":{forward:[6,1,1,""]},"torchaudio.compliance.kaldi":{fbank:[0,2,1,""],mfcc:[0,2,1,""],resample_waveform:[0,2,1,""],spectrogram:[0,2,1,""]},"torchaudio.datasets":{VCTK:[1,3,1,""],YESNO:[1,3,1,""]},"torchaudio.datasets.VCTK":{__getitem__:[1,1,1,""]},"torchaudio.datasets.YESNO":{__getitem__:[1,1,1,""]},"torchaudio.functional":{amplitude_to_DB:[2,2,1,""],angle:[2,2,1,""],complex_norm:[2,2,1,""],create_dct:[2,2,1,""],create_fb_matrix:[2,2,1,""],istft:[2,2,1,""],magphase:[2,2,1,""],mu_law_decoding:[2,2,1,""],mu_law_encoding:[2,2,1,""],phase_vocoder:[2,2,1,""],spectrogram:[2,2,1,""]},"torchaudio.kaldi_io":{read_mat_ark:[4,2,1,""],read_mat_scp:[4,2,1,""],read_vec_flt_ark:[4,2,1,""],read_vec_flt_scp:[4,2,1,""],read_vec_int_ark:[4,2,1,""]},"torchaudio.sox_effects":{SoxEffect:[5,3,1,""],SoxEffectsChain:[5,3,1,""]},"torchaudio.sox_effects.SoxEffectsChain":{append_effect_to_chain:[5,1,1,""],clear_chain:[5,1,1,""],set_input_file:[5,1,1,""],sox_build_flow_effects:[5,1,1,""]},"torchaudio.transforms":{Ampli
tudeToDB:[6,3,1,""],MFCC:[6,3,1,""],MelScale:[6,3,1,""],MelSpectrogram:[6,3,1,""],MuLawDecoding:[6,3,1,""],MuLawEncoding:[6,3,1,""],Resample:[6,3,1,""],Spectrogram:[6,3,1,""]},torchaudio:{get_sox_bool:[3,2,1,""],get_sox_encoding_t:[3,2,1,""],get_sox_option_t:[3,2,1,""],info:[3,2,1,""],initialize_sox:[3,2,1,""],load:[3,2,1,""],load_wav:[3,2,1,""],save:[3,2,1,""],save_encinfo:[3,2,1,""],shutdown_sox:[3,2,1,""],sox_encodinginfo_t:[3,2,1,""],sox_signalinfo_t:[3,2,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","method","Python method"],"2":["py","function","Python function"],"3":["py","class","Python class"]},objtypes:{"0":"py:module","1":"py:method","2":"py:function","3":"py:class"},terms:{"16000hz":5,"179d6e9a88202ab0a2f":6,"boolean":[3,5],"byte":3,"class":[1,5,6],"default":[0,1,2,3,5,6],"enum":3,"final":3,"float":[0,2,3,6],"function":[1,3,5,6],"import":2,"int":[0,1,2,3,4,5,6],"new":0,"return":[0,1,2,3,4,5,6],"short":2,"true":[0,1,2,3,5,6],CMS:0,For:[1,2,3,6],Not:3,The:[0,1,2,3,4,5,6],These:2,Use:2,Useful:1,__getitem__:[1,5],__init__:5,__len__:[1,5],__members__:3,_get_strid:0,_length:2,_modul:6,abs:3,absolut:0,accord:4,add:0,addit:2,advanc:2,after:[2,3,6],again:1,aggress:2,algorithm:[2,6],all:[1,2,3],almost:1,alreadi:1,altern:1,amin:2,amount:2,amplitud:[2,6],amplitude_to_db:3,amplitudetodb:3,angl:3,anoth:6,api:1,append:5,append_effect_to_chain:5,appli:[0,2,6],applic:0,apr:2,arg:1,argument:[1,6],ark:4,around:[0,4],asr:0,assp:2,assum:[2,3,5,6],attempt:3,attribut:5,audio:[0,1,2,3,5,6],audiodir_path:5,automat:[3,5],avail:1,bandlimit:0,bank:[2,6],base:[2,6],batch:2,batch_siz:1,becaus:2,been:[2,6],befor:[0,2,6],begin:3,being:[2,6],between:[2,5,6],bin:[0,2,6],bit:[3,5],bits_per_sampl:3,blackman:0,blackman_coeff:0,blob:0,bool:[0,1,2,3,5,6],both:2,build:5,built:6,calcul:[0,2,6],call:3,callabl:[1,3,5,6],can:[0,1,2,3,6],cancel:2,cannot:[2,3,5],caution:0,ccrma:0,ceil:2,center:2,cepstra:0,cepstral_lift:0,cepstrum:6,certain:2,chain:[3,5,6],chang:[0,3],channel:[0,2,3
,5,6],channels_first:[3,5],check:2,choos:3,clamp:2,clean:1,clear:5,clear_chain:5,clip:[2,6],coeffici:[0,2,6],column:2,com:[0,6],common:[1,2,3,6],compand:[2,6],compat:0,complet:0,complex:2,complex_norm:3,complex_specgram:2,complex_specgrams_stretch:2,complex_tensor:2,complianc:3,compon:0,composit:6,compress:3,comput:[0,2],condit:2,consider:2,consist:[3,6],constant:[0,2],control:[0,2,6],conveni:3,convers:[2,6],convert:[2,6],core:6,cosin:6,could:[2,3,5],creat:[0,2,3,4,5,6],create_dct:3,create_fb_matrix:3,cut:[2,6],cutoff:0,data:[0,1,2,3,5],data_load:1,data_vol_norm:3,dataload:1,dataset:[3,5],db_multipli:2,dct:[2,6],dct_type:6,decibel:[2,6],decod:[2,6],def:5,depend:[0,2,4,6],descriptor:4,desir:[0,6],determin:[3,5],dev_mod:1,devic:6,dict:[3,6],dictionari:4,differ:[2,6],dimens:[0,2,6],directori:1,discard:2,discret:6,disk:3,distanc:2,dither:0,divid:[3,5],down:2,download:1,downsampl:[0,1],durat:0,each:[0,2,3,6],earg:5,edu:0,effect:[0,3,5],effici:0,either:2,element:[0,2],elementwis:6,els:0,enam:5,encod:[2,3,6],encodinginfo:3,end:[0,2],endian:3,energi:[0,2,6],energy_floor:0,entri:[2,6],envelop:2,eopt:5,epsilon:0,estim:2,etc:[2,3,6],everyth:3,exactli:2,exampl:[1,2,3,4,5,6],exist:1,expect:[0,2,6],expon:[2,6],extens:[3,5],extra:0,extract:0,f_max:[2,6],f_min:[2,6],factor:[0,2,3],fals:[0,1,2,6],fault:3,feat:0,featur:0,fft:[0,2,6],fft_size:2,file:[0,1,3,4,5],file_or_fd:4,filedescriptor:4,filepath:3,filetyp:[3,5],filter:[0,2,6],filterbank:[0,2,6],finish:3,first:[3,5,6],fit:[0,2],fix:0,float32:4,float64:4,floor:0,flow:5,follow:[1,5],foo:3,format:3,forward:6,fourier:[2,6],frame:[0,2,3,5,6],frame_length:0,frame_shift:0,freq:[0,2,6],frequenc:[0,2,6],from:[0,1,2,3,4,5,6],full:[2,6],gener:[0,4],get:[0,3],get_sox_bool:3,get_sox_encoding_t:3,get_sox_option_t:3,gist:6,github:[0,6],give:6,given:[0,2,3,5,6],griffin:2,gzip:4,ham:0,han:0,handl:0,hann_window:6,has:[0,2,6],have:1,haythamfayek:6,headroom:3,hebrew:1,help:[3,5],henc:1,here:[5,6],high:0,high_freq:0,highlight:2,hop:[2,6],hop_length:[2,
6],htk:0,htk_compat:0,html:[0,6],http:[0,6],ident:0,ieee:2,imag:1,implement:[1,6],importantli:3,includ:0,index:[1,5],individu:0,inflect:0,info:[2,3,6],inform:[2,5],infti:2,initi:3,initialize_sox:[3,5],input:[0,1,2,3,5,6],input_fil:5,instal:4,instead:[3,6],integ:[3,5],internet:1,interpol:0,interv:0,invers:[2,6],isn:2,istft:3,its:2,join:5,jos:0,kaldi:[3,4],kaldi_io:3,kastnerkyl:6,keep:1,kei:4,kwarg:3,l56:0,lambda:3,last:[0,2],law:[2,6],learn:6,least:2,left:[0,2],len:5,length:[0,2,3,5,6],less:0,librosa:6,light:4,like:3,lim:2,linear:0,linearli:0,linearresampl:0,linspac:2,list:[3,5],listdir:5,load:[1,3,6],load_wav:3,log10:2,log:[0,6],log_mel:6,loss:2,lossi:3,low:0,low_freq:0,lowpass_filter_width:0,machin:6,magnitud:[0,2,6],magphas:3,mai:[2,6],master:0,match:0,math:2,matric:3,matrix:[2,4,6],max:[2,3],maximum:[2,6],mean:[0,2],mel:[0,2,6],mel_specgram:6,melkwarg:6,melscal:3,melspectrogram:3,metadata:[3,5],method:[1,2,6],mfc:[2,6],mfcc:3,millisecond:0,min_dur:0,minimum:[0,2,6],modifi:2,modul:4,mono:[0,5],more:[0,2,6],mp3:3,mu_law_decod:3,mu_law_encod:3,mulawdecod:3,mulawencod:3,mult:3,multipl:1,multipli:[2,3,6],multiprocess:1,must:[0,2,6],mydataset:5,n_fft:[2,6],n_frame:[2,6],n_freq:2,n_mel:[2,6],n_mfcc:[2,6],n_stft:6,name:5,need:[0,3,4,6],neg:[0,2,6],neighbor:2,never:2,new_freq:[0,6],nibbl:3,nola:2,none:[1,2,3,5,6],nonzero:2,norm:[2,6],normal:[0,2,3,5,6],note:3,nthread:1,num_cep:0,num_fram:3,num_freq:2,num_mel_bin:0,num_work:1,number:[0,2,3,5,6],numer:6,numeric_limit:0,nyquist:0,object:[3,5,6],occur:2,off:[0,2,6],offlinefeaturetpl:0,offset:[0,3],onc:3,one:[3,6],ones:2,onesid:2,onli:[0,3],open:4,oper:[0,2],opposite_endian:3,option:[0,1,2,3,5,6],orig_freq:[0,6],origin:[0,2,6],ortho:[2,6],other:0,out:[3,5],out_encinfo:5,out_siginfo:5,output:[0,1,2,3,5,6],overlap:2,packag:3,pad:[0,2,6],pad_mod:2,padded_window_s:0,parallelli:1,paramet:[0,1,2,3,4,5,6],pass:[1,3,5],path:[3,5],path_to_audio_fil:5,pathlib:3,per:3,perform:[0,1,2],phase:2,phase_adv:2,phase_vocod:3,piecewis:0,pil:1,pip
e:4,pitch:2,point:[0,2],popular:3,possibl:3,povei:0,power:[0,2,6],practic:3,precis:3,preemphasi:0,preemphasis_coeffici:0,preprocess:5,prevent:2,primarili:3,print:3,process:[0,1,6],produc:0,provid:2,put:[0,1],python:[3,5,6],quantization_channel:[2,6],randn:2,rang:0,rate:[2,3,5,6],rather:3,raw:[0,1,2,5,6],raw_energi:0,read:4,reason:[2,6],recommend:0,rectangular:0,refer:[2,3],reflect:[0,2],rel:0,remov:2,remove_dc_offset:0,repres:0,requir:3,resampl:[0,3,5],resamplewaveform:0,resampling_method:6,respect:1,result:[2,3,5],retain:[2,6],revers:3,reverse_bit:3,reverse_byt:3,reverse_nibbl:3,right:[0,2,3],root:1,round:0,round_to_power_of_two:0,row:2,run:3,same:[2,3],sampl:[0,1,3,5,6],sample_frequ:0,sample_r:[3,6],save:3,save_encinfo:3,scale:[0,2,6],scp:4,second:0,see:[2,3,6],seg:3,segment:0,self:5,separ:2,sequenti:6,set:[0,3,5],set_input_fil:5,shape:[0,2,3],sharp:0,sharper:0,shift:[0,3],shorter:2,should:[0,2],showdown:3,shuffl:1,shutdown:3,shutdown_sox:[3,5],side:[2,6],sig:5,sign:[3,5],signal:[0,1,2,3,5,6],signal_length:2,signalinfo:3,similar:[0,1],simpl:3,sinc:[0,2],sinc_interpol:6,size:[0,2,3,5,6],slide:2,slow:2,snip_edg:0,snippet:[2,6],some:2,someth:5,sourc:[0,1,2,3,4,5,6],sox:[3,5],sox_bool:3,sox_build_flow_effect:5,sox_effect:3,sox_encoding_t:3,sox_encodinginfo_t:[3,5],sox_fals:3,sox_option_default:3,sox_option_t:3,sox_signalinfo_t:[3,5],soxeffect:3,soxeffectschain:3,space:0,spec_f:6,specgram:6,specgram_mel_db:6,specif:[2,3],specifi:[0,3],spectrogram:[1,3],spectrum:6,speech:6,speed:2,split:[2,6],squar:[2,6],src:[0,3],stabl:6,standard:3,stanford:0,start:3,std:0,stft:[2,6],stft_matrix:2,str:[0,1,2,3,4,5,6],stream:4,string:4,stype:6,subclass:1,subtract:0,subtract_mean:0,suffici:0,suggest:0,sum_:2,summat:2,suppos:2,take:1,target:1,target_transform:1,tensor:[0,1,2,3,4,5,6],test:[1,6],text:2,textbook:6,than:2,thei:[1,2,6],theory_ideal_bandlimited_interpol:0,thi:[0,2,3,4,6],time:[2,6],timsainb:6,togeth:6,top_db:[2,6],torch:[0,1,2,3,4,5,6],total:0,train:1,tran:2,transcript:1,trans
form:[1,2,3],triangular:[0,2,6],trim:2,tupl:[1,2,3,4,5],turn:[0,2,6],two:[0,1,2,6],type:[0,1,2,3,4,5,6],ulaw:3,unchang:[2,6],uniqu:3,unknown:3,unspecifi:3,upsampl:0,url:1,use:[0,2,3,4,6],use_energi:0,use_log_fbank:0,use_pow:0,used:[2,3],useful:[0,2],user:6,uses:[0,3,6],using:[1,2,6],util:1,valu:[2,3,6],variou:0,vctk:3,vector:3,version:1,vol:2,vtln:0,vtln_high:0,vtln_low:0,vtln_map:0,vtln_warp:0,wai:0,warn:0,warp:0,wav:[3,6],wave:3,waveform:[0,2,6],what:0,when:[2,3],where:[0,1,2,3,5,6],whether:[1,2,6],which:[0,1,2,3,4,5,6],whole:2,wikipedia:[2,6],win_length:[2,6],window:[0,2,6],window_fn:6,window_typ:0,wise:2,without:[2,3],wkwarg:6,worker:1,would:[0,2],wrapper:4,written:5,x_mu:[2,6],yesno:3,yesno_data:1,you:[0,3],zero:[0,2],zeroth:0},titles:["torchaudio.compliance.kaldi","torchaudio.datasets","torchaudio.functional","torchaudio","torchaudio.kaldi_io","torchaudio.sox_effects","torchaudio.transforms"],titleterms:{"function":[0,2],amplitude_to_db:2,amplitudetodb:6,angl:2,complex_norm:2,complianc:0,create_dct:2,create_fb_matrix:2,dataset:1,fbank:0,istft:2,kaldi:0,kaldi_io:4,magphas:2,matric:4,melscal:6,melspectrogram:6,mfcc:[0,6],mu_law_decod:2,mu_law_encod:2,mulawdecod:6,mulawencod:6,phase_vocod:2,read_mat_ark:4,read_mat_scp:4,read_vec_flt_ark:4,read_vec_flt_scp:4,read_vec_int_ark:4,resampl:6,resample_waveform:0,sox_effect:5,soxeffect:5,soxeffectschain:5,spectrogram:[0,2,6],torchaudio:[0,1,2,3,4,5,6],transform:6,vctk:1,vector:4,yesno:1}})
\ No newline at end of file
diff --git a/sox_effects.html b/sox_effects.html
index d5647bb1d2..31e2eecddd 100644
--- a/sox_effects.html
+++ b/sox_effects.html
@@ -265,7 +265,7 @@ SoxEffectsChain
- Return type
-Tuple[torch.Tensor, int]
+Tuple[torch.Tensor, int]
@@ -330,7 +330,7 @@ SoxEffectsChainBuild effects chain and flow effects from input file to output tensor
- Parameters
-out (torch.Tensor) – Where the output will be written to. (Default: None)
+out (torch.Tensor) – Where the output will be written to. (Default: None)
- Returns
An output Tensor of size [C x L] or [L x C] where L is the number
@@ -338,7 +338,7 @@
SoxEffectsChain
- Return type
-Tuple[torch.Tensor, int]
+Tuple[torch.Tensor, int]
diff --git a/transforms.html b/transforms.html
index 1719f81ae5..1f8d1a84da 100644
--- a/transforms.html
+++ b/transforms.html
@@ -218,7 +218,7 @@
torchaudio.transforms¶
-Transforms are common audio transforms. They can be chained together using torch.nn.Sequential
+Transforms are common audio transforms. They can be chained together using torch.nn.Sequential
Spectrogram¶
@@ -233,7 +233,7 @@ Spectrogramint, optional) – Length of hop between STFT windows. (
Default: win_length // 2)
pad (int) – Two sided padding of signal. (Default: 0)
-window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor
+
window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor
that is applied/multiplied to each frame/window. (Default: torch.hann_window)
power (int) – Exponent for the magnitude spectrogram,
(must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: 2)
@@ -247,7 +247,7 @@ Spectrogramforward(waveform)[source]¶
- Parameters
-waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
+waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
- Returns
Dimension (channel, freq, time), where channel
@@ -255,7 +255,7 @@
SpectrogramReturn type
--
+
-
@@ -289,13 +289,13 @@ AmplitudeToDBhttps://librosa.github.io/librosa/_modules/librosa/core/spectrum.html
- Parameters
-x (torch.Tensor) – Input tensor before being converted to decibel scale
+x (torch.Tensor) – Input tensor before being converted to decibel scale
- Returns
Output tensor in decibel scale
- Return type
--
+
-
@@ -328,13 +328,13 @@ MelScaleforward(specgram)[source]¶
- Parameters
-specgram (torch.Tensor) – A spectrogram STFT of dimension (channel, freq, time)
+specgram (torch.Tensor) – A spectrogram STFT of dimension (channel, freq, time)
- Returns
Mel frequency spectrogram of size (channel, n_mels, time)
- Return type
--
+
-
@@ -369,7 +369,7 @@ MelSpectrogramf_max (float, optional) – Maximum frequency. (Default: None)
pad (int) – Two sided padding of signal. (Default: 0)
n_mels (int) – Number of mel filterbanks. (Default: 128)
-window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor
+
window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor
that is applied/multiplied to each frame/window. (Default: torch.hann_window)
wkwargs (Dict[.., ..]) – Arguments for window function. (Default: None)
@@ -387,13 +387,13 @@ MelSpectrogramforward(waveform)[source]¶
- Parameters
-waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
+waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
- Returns
Mel frequency spectrogram of size (channel, n_mels, time)
- Return type
--
+
-
@@ -431,13 +431,13 @@ MFCCforward(waveform)[source]¶
- Parameters
-waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
+waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)
- Returns
specgram_mel_db of size (channel, n_mfcc, time)
- Return type
--
+
-
@@ -464,13 +464,13 @@ MuLawEncodingforward(x)[source]¶
- Parameters
-x (torch.Tensor) – A signal to be encoded
+x (torch.Tensor) – A signal to be encoded
- Returns
An encoded signal
- Return type
-x_mu (torch.Tensor)
+x_mu (torch.Tensor)
@@ -497,13 +497,13 @@ MuLawDecodingforward(x_mu)[source]¶
- Parameters
-x_mu (torch.Tensor) – A mu-law encoded signal which needs to be decoded
+x_mu (torch.Tensor) – A mu-law encoded signal which needs to be decoded
- Returns
The signal decoded
- Return type
--
+
-
@@ -532,13 +532,13 @@ Resampleforward(waveform)[source]¶
- Parameters
-waveform (torch.Tensor) – The input signal of dimension (channel, time)
+waveform (torch.Tensor) – The input signal of dimension (channel, time)
- Returns
Output signal of dimension (channel, time)
- Return type
--
+
-