diff --git a/_modules/torchaudio.html b/_modules/torchaudio.html index 53288ae382..1d6923ec6b 100644 --- a/_modules/torchaudio.html +++ b/_modules/torchaudio.html @@ -213,7 +213,7 @@

Source code for torchaudio

-from __future__ import division, print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 import os.path
 
 import torch
diff --git a/_modules/torchaudio/compliance/kaldi.html b/_modules/torchaudio/compliance/kaldi.html
index a9875c3b2b..41051037e8 100644
--- a/_modules/torchaudio/compliance/kaldi.html
+++ b/_modules/torchaudio/compliance/kaldi.html
@@ -215,19 +215,22 @@
              

Source code for torchaudio.compliance.kaldi

-import math
+from __future__ import absolute_import, division, print_function, unicode_literals
+import math
+import fractions
 import random
 import torch
-
+import torchaudio
 
 __all__ = [
-    'fbank',
     'get_mel_banks',
     'inverse_mel_scale',
     'inverse_mel_scale_scalar',
     'mel_scale',
     'mel_scale_scalar',
     'spectrogram',
+    'fbank',
+    'mfcc',
     'vtln_warp_freq',
     'vtln_warp_mel_freq',
     'resample_waveform',
@@ -332,7 +335,9 @@ 

Source code for torchaudio.compliance.kaldi

                                         frame_length, round_to_power_of_two, preemphasis_coefficient):
     r"""Gets the waveform and window properties
     """
-    waveform = waveform[max(channel, 0), :]  # size (n)
+    channel = max(channel, 0)
+    assert channel < waveform.size(0), ('Invalid channel %d for size %d' % (channel, waveform.size(0)))
+    waveform = waveform[channel, :]  # size (n)
     window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
     window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
     padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
@@ -397,6 +402,15 @@ 

Source code for torchaudio.compliance.kaldi

     return strided_input, signal_log_energy
 
 
+def _subtract_column_mean(tensor, subtract_mean):
+    # subtracts the column mean of the tensor size (m, n) if subtract_mean=True
+    # it returns size (m, n)
+    if subtract_mean:
+        col_means = torch.mean(tensor, dim=0).unsqueeze(0)
+        tensor = tensor - col_means
+    return tensor
+
+
 
[docs]def spectrogram( waveform, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, min_duration=0.0, @@ -454,10 +468,7 @@

Source code for torchaudio.compliance.kaldi

     power_spectrum = torch.max(fft.pow(2).sum(2), EPSILON).log()  # size (m, padded_window_size // 2 + 1)
     power_spectrum[:, 0] = signal_log_energy
 
-    if subtract_mean:
-        col_means = torch.mean(power_spectrum, dim=0).unsqueeze(0)  # size (1, padded_window_size // 2 + 1)
-        power_spectrum = power_spectrum - col_means
-
+    power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
     return power_spectrum
@@ -719,7 +730,7 @@

Source code for torchaudio.compliance.kaldi

         # avoid log of zero (which should be prevented anyway by dithering)
         mel_energies = torch.max(mel_energies, EPSILON).log()
 
-    # if use_energy then add it as the first column for htk_compat == true else last column
+    # if use_energy then add it as the last column for htk_compat == true else first column
     if use_energy:
         signal_log_energy = signal_log_energy.unsqueeze(1)  # size (m, 1)
         # returns size (m, num_mel_bins + 1)
@@ -728,13 +739,134 @@ 

Source code for torchaudio.compliance.kaldi

         else:
             mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)
 
-    if subtract_mean:
-        col_means = torch.mean(mel_energies, dim=0).unsqueeze(0)  # size (1, num_mel_bins + use_energy)
-        mel_energies = mel_energies - col_means
-
+    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
     return mel_energies
+def _get_dct_matrix(num_ceps, num_mel_bins): + # returns a dct matrix of size (num_mel_bins, num_ceps) + # size (num_mel_bins, num_mel_bins) + dct_matrix = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, 'ortho') + # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins) + # this would be the first column in the dct_matrix for torchaudio as it expects a + # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi + # expects a left multiply e.g. dct_matrix * vector). + dct_matrix[:, 0] = math.sqrt(1 / float(num_mel_bins)) + dct_matrix = dct_matrix[:, :num_ceps] + return dct_matrix + + +def _get_lifter_coeffs(num_ceps, cepstral_lifter): + # returns size (num_ceps) + # Compute liftering coefficients (scaling on cepstral coeffs) + # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected. + i = torch.arange(num_ceps, dtype=torch.get_default_dtype()) + return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter) + + +
[docs]def mfcc( + waveform, blackman_coeff=0.42, cepstral_lifter=22.0, channel=-1, dither=1.0, + energy_floor=0.0, frame_length=25.0, frame_shift=10.0, high_freq=0.0, htk_compat=False, + low_freq=20.0, num_ceps=13, min_duration=0.0, num_mel_bins=23, preemphasis_coefficient=0.97, + raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, + sample_frequency=16000.0, snip_edges=True, subtract_mean=False, use_energy=False, + vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type=POVEY): + r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's + compute-mfcc-feats. + + Args: + waveform (torch.Tensor): Tensor of audio of size (c, n) where c is in the range [0,2) + blackman_coeff (float): Constant coefficient for generalized Blackman window. (Default: ``0.42``) + cepstral_lifter (float): Constant that controls scaling of MFCCs (Default: ``22.0``) + channel (int): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``) + dither (float): Dithering constant (0.0 means no dither). If you turn this off, you should set + the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``1.0``) + energy_floor (float): Floor on energy (absolute, not relative) in Spectrogram computation. Caution: + this floor is applied to the zeroth component, representing the total signal energy. The floor on the + individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``0.0``) + frame_length (float): Frame length in milliseconds (Default: ``25.0``) + frame_shift (float): Frame shift in milliseconds (Default: ``10.0``) + high_freq (float): High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (Default: ``0.0``) + htk_compat (bool): If true, put energy last. Warning: not sufficient to get HTK compatible features (need + to change other parameters). 
(Default: ``False``) + low_freq (float): Low cutoff frequency for mel bins (Default: ``20.0``) + num_ceps (int): Number of cepstra in MFCC computation (including C0) (Default: ``13``) + min_duration (float): Minimum duration of segments to process (in seconds). (Default: ``0.0``) + num_mel_bins (int): Number of triangular mel-frequency bins (Default: ``23``) + preemphasis_coefficient (float): Coefficient for use in signal preemphasis (Default: ``0.97``) + raw_energy (bool): If True, compute energy before preemphasis and windowing (Default: ``True``) + remove_dc_offset: Subtract mean from waveform on each frame (Default: ``True``) + round_to_power_of_two (bool): If True, round window size to power of two by zero-padding input + to FFT. (Default: ``True``) + sample_frequency (float): Waveform data sample frequency (must match the waveform file, if + specified there) (Default: ``16000.0``) + snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit + in the file, and the number of frames depends on the frame_length. If False, the number of frames + depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``) + subtract_mean (bool): Subtract mean of each feature file [CMS]; not recommended to do + it this way. (Default: ``False``) + use_energy (bool): Use energy (not C0) in MFCC computation; the output keeps ``num_ceps`` columns. (Default: ``False``) + vtln_high (float): High inflection point in piecewise linear VTLN warping function (if + negative, offset from high-mel-freq) (Default: ``-500.0``) + vtln_low (float): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``) + vtln_warp (float): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``) + window_type (str): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman') (Default: ``'povey'``) + + Returns: + torch.Tensor: A mfcc identical to what Kaldi would output. 
The shape is (m, ``num_ceps``) + where m is calculated in _get_strided + """ + assert num_ceps <= num_mel_bins, 'num_ceps cannot be larger than num_mel_bins: %d vs %d' % (num_ceps, num_mel_bins) + + # The mel_energies should not be squared (use_power=True), not have mean subtracted + # (subtract_mean=False), and use log (use_log_fbank=True). + # size (m, num_mel_bins + use_energy) + feature = fbank(waveform=waveform, blackman_coeff=blackman_coeff, channel=channel, + dither=dither, energy_floor=energy_floor, frame_length=frame_length, + frame_shift=frame_shift, high_freq=high_freq, htk_compat=htk_compat, + low_freq=low_freq, min_duration=min_duration, num_mel_bins=num_mel_bins, + preemphasis_coefficient=preemphasis_coefficient, raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, round_to_power_of_two=round_to_power_of_two, + sample_frequency=sample_frequency, snip_edges=snip_edges, subtract_mean=False, + use_energy=use_energy, use_log_fbank=True, use_power=True, + vtln_high=vtln_high, vtln_low=vtln_low, vtln_warp=vtln_warp, window_type=window_type) + + if use_energy: + # size (m) + signal_log_energy = feature[:, num_mel_bins if htk_compat else 0] + # offset is 0 if htk_compat==True else 1 + mel_offset = int(not htk_compat) + feature = feature[:, mel_offset:(num_mel_bins + mel_offset)] + + # size (num_mel_bins, num_ceps) + dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins) + + # size (m, num_ceps) + feature = feature.matmul(dct_matrix) + + if cepstral_lifter != 0.0: + # size (1, num_ceps) + lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0) + feature *= lifter_coeffs + + # if use_energy then replace the last column for htk_compat == true else first column + if use_energy: + feature[:, 0] = signal_log_energy + + if htk_compat: + energy = feature[:, 0].unsqueeze(1) # size (m, 1) + feature = feature[:, 1:] # size (m, num_ceps - 1) + if not use_energy: + # scale on C0 (actually removing a scale we previously added that's + # part of one 
common definition of the cosine transform.) + energy *= math.sqrt(2) + + feature = torch.cat((feature, energy), dim=1) + + feature = _subtract_column_mean(feature, subtract_mean) + return feature
+ + def _get_LR_indices_and_weights(orig_freq, new_freq, output_samples_in_unit, window_width, lowpass_cutoff, lowpass_filter_width): r"""Based on LinearResample::SetIndexesAndWeights where it retrieves the weights for @@ -817,7 +949,7 @@

Source code for torchaudio.compliance.kaldi

 
 
 def _lcm(a, b):
-    return abs(a * b) // math.gcd(a, b)
+    return abs(a * b) // fractions.gcd(a, b)
 
 
 def _get_num_LR_output_samples(input_num_samp, samp_rate_in, samp_rate_out):
@@ -892,7 +1024,7 @@ 

Source code for torchaudio.compliance.kaldi

 
     assert lowpass_cutoff * 2 <= min_freq
 
-    base_freq = math.gcd(int(orig_freq), int(new_freq))
+    base_freq = fractions.gcd(int(orig_freq), int(new_freq))
     input_samples_in_unit = int(orig_freq) // base_freq
     output_samples_in_unit = int(new_freq) // base_freq
 
diff --git a/_modules/torchaudio/datasets/vctk.html b/_modules/torchaudio/datasets/vctk.html
index 6a05437da8..147c4bb4cf 100644
--- a/_modules/torchaudio/datasets/vctk.html
+++ b/_modules/torchaudio/datasets/vctk.html
@@ -215,7 +215,7 @@
              

Source code for torchaudio.datasets.vctk

-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 import torch.utils.data as data
 import os
 import os.path
diff --git a/_modules/torchaudio/datasets/yesno.html b/_modules/torchaudio/datasets/yesno.html
index 895158770a..e773982335 100644
--- a/_modules/torchaudio/datasets/yesno.html
+++ b/_modules/torchaudio/datasets/yesno.html
@@ -215,7 +215,7 @@
              

Source code for torchaudio.datasets.yesno

-from __future__ import print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 import torch.utils.data as data
 import os
 import os.path
diff --git a/_modules/torchaudio/functional.html b/_modules/torchaudio/functional.html
index 6144c36350..92edd060a9 100644
--- a/_modules/torchaudio/functional.html
+++ b/_modules/torchaudio/functional.html
@@ -215,7 +215,8 @@
              

Source code for torchaudio.functional

-import math
+from __future__ import absolute_import, division, print_function, unicode_literals
+import math
 import torch
 
 
@@ -278,8 +279,8 @@ 

Source code for torchaudio.functional

     could be useful. If ``length`` is ``None`` then padding will be aggressively removed
     (some loss of signal).
 
-    [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,”
-    IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.
+    [1] D. W. Griffin and J. S. Lim, "Signal estimation from modified short-time Fourier transform,"
+    IEEE Trans. ASSP, vol.32, no.2, pp.236-243, Apr. 1984.
 
     Args:
         stft_matrix (torch.Tensor): Output of stft where each row of a channel is a frequency and each
@@ -312,6 +313,7 @@ 

Source code for torchaudio.functional

         # add a channel dimension
         stft_matrix = stft_matrix.unsqueeze(0)
 
+    dtype = stft_matrix.dtype
     device = stft_matrix.device
     fft_size = stft_matrix.size(1)
     assert (onesided and n_fft // 2 + 1 == fft_size) or (not onesided and n_fft == fft_size), (
@@ -330,7 +332,7 @@ 

Source code for torchaudio.functional

     assert 0 < win_length <= n_fft
 
     if window is None:
-        window = torch.ones(win_length)
+        window = torch.ones(win_length, requires_grad=False, device=device, dtype=dtype)
 
     assert window.dim() == 1 and window.size(0) == win_length
 
@@ -353,7 +355,7 @@ 

Source code for torchaudio.functional

     ytmp = ytmp.transpose(1, 2)  # size (channel, n_fft, n_frames)
 
     eye = torch.eye(n_fft, requires_grad=False,
-                    device=device).unsqueeze(1)  # size (n_fft, 1, n_fft)
+                    device=device, dtype=dtype).unsqueeze(1)  # size (n_fft, 1, n_fft)
 
     # this does overlap add where the frames of ytmp are added such that the i'th frame of
     # ytmp is added starting at i*hop_length in the output
diff --git a/_modules/torchaudio/kaldi_io.html b/_modules/torchaudio/kaldi_io.html
index 625c2f645b..1b0368f1dd 100644
--- a/_modules/torchaudio/kaldi_io.html
+++ b/_modules/torchaudio/kaldi_io.html
@@ -215,7 +215,8 @@
              

Source code for torchaudio.kaldi_io

-# To use this file, the dependency (https://github.com/vesis84/kaldi-io-for-python)
+from __future__ import absolute_import, division, print_function, unicode_literals
+# To use this file, the dependency (https://github.com/vesis84/kaldi-io-for-python)
 # needs to be installed. This is a light wrapper around kaldi_io that returns
 # torch.Tensors.
 import torch
diff --git a/_modules/torchaudio/sox_effects.html b/_modules/torchaudio/sox_effects.html
index 88ebdfc210..fd5ce7d101 100644
--- a/_modules/torchaudio/sox_effects.html
+++ b/_modules/torchaudio/sox_effects.html
@@ -215,7 +215,7 @@
              

Source code for torchaudio.sox_effects

-from __future__ import division, print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 import torch
 import _torch_sox
 
diff --git a/_modules/torchaudio/transforms.html b/_modules/torchaudio/transforms.html
index 28754fddbb..4a4817e32d 100644
--- a/_modules/torchaudio/transforms.html
+++ b/_modules/torchaudio/transforms.html
@@ -215,7 +215,7 @@
              

Source code for torchaudio.transforms

-from __future__ import division, print_function
+from __future__ import absolute_import, division, print_function, unicode_literals
 from warnings import warn
 import math
 import torch
diff --git a/_sources/compliance.kaldi.rst.txt b/_sources/compliance.kaldi.rst.txt
index 1dfee29eb1..cc75021d69 100644
--- a/_sources/compliance.kaldi.rst.txt
+++ b/_sources/compliance.kaldi.rst.txt
@@ -15,15 +15,20 @@ produce similar outputs.
 Functions
 ---------
 
+:hidden:`spectrogram`
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: spectrogram
+
 :hidden:`fbank`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 .. autofunction:: fbank
 
-:hidden:`spectrogram`
+:hidden:`mfcc`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autofunction:: spectrogram
+.. autofunction:: mfcc
 
 :hidden:`resample_waveform`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/compliance.kaldi.html b/compliance.kaldi.html
index 54af640bbc..634ab9693e 100644
--- a/compliance.kaldi.html
+++ b/compliance.kaldi.html
@@ -223,6 +223,53 @@ 

torchaudio.compliance.kaldi

Functions

+
+

spectrogram

+
+
+torchaudio.compliance.kaldi.spectrogram(waveform, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, min_duration=0.0, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, subtract_mean=False, window_type='povey')[source]
+

Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi’s +compute-spectrogram-feats.

+
+
Parameters
+
    +
  • waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)

  • +
  • blackman_coeff (float) – Constant coefficient for generalized Blackman window. (Default: 0.42)

  • +
  • channel (int) – Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: -1)

  • +
  • dither (float) – Dithering constant (0.0 means no dither). If you turn this off, you should set +the energy_floor option, e.g. to 1.0 or 0.1 (Default: 1.0)

  • +
  • energy_floor (float) – Floor on energy (absolute, not relative) in Spectrogram computation. Caution: +this floor is applied to the zeroth component, representing the total signal energy. The floor on the +individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: 0.0)

  • +
  • frame_length (float) – Frame length in milliseconds (Default: 25.0)

  • +
  • frame_shift (float) – Frame shift in milliseconds (Default: 10.0)

  • +
  • min_duration (float) – Minimum duration of segments to process (in seconds). (Default: 0.0)

  • +
  • preemphasis_coefficient (float) – Coefficient for use in signal preemphasis (Default: 0.97)

  • +
  • raw_energy (bool) – If True, compute energy before preemphasis and windowing (Default: True)

  • +
  • remove_dc_offset – Subtract mean from waveform on each frame (Default: True)

  • +
  • round_to_power_of_two (bool) – If True, round window size to power of two by zero-padding input +to FFT. (Default: True)

  • +
  • sample_frequency (float) – Waveform data sample frequency (must match the waveform file, if +specified there) (Default: 16000.0)

  • +
  • snip_edges (bool) – If True, end effects will be handled by outputting only frames that completely fit +in the file, and the number of frames depends on the frame_length. If False, the number of frames +depends only on the frame_shift, and we reflect the data at the ends. (Default: True)

  • +
  • subtract_mean (bool) – Subtract mean of each feature file [CMS]; not recommended to do +it this way. (Default: False)

  • +
  • window_type (str) – Type of window (‘hamming’|’hanning’|’povey’|’rectangular’|’blackman’) (Default: 'povey')

  • +
+
+
Returns
+

A spectrogram identical to what Kaldi would output. The shape is +(m, padded_window_size // 2 + 1) where m is calculated in _get_strided

+
+
Return type
+

torch.Tensor

+
+
+
+ +

fbank

@@ -233,7 +280,7 @@

fbank
Parameters
    -
  • waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)

  • +
  • waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)

  • blackman_coeff (float) – Constant coefficient for generalized Blackman window. (Default: 0.42)

  • channel (int) – Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: -1)

  • dither (float) – Dithering constant (0.0 means no dither). If you turn this off, you should set @@ -276,24 +323,25 @@

    fbank

Return type
-

torch.Tensor

+

torch.Tensor

-
-

spectrogram

+
+

mfcc

-
-torchaudio.compliance.kaldi.spectrogram(waveform, blackman_coeff=0.42, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, min_duration=0.0, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, subtract_mean=False, window_type='povey')[source]
-

Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi’s -compute-spectrogram-feats.

+
+torchaudio.compliance.kaldi.mfcc(waveform, blackman_coeff=0.42, cepstral_lifter=22.0, channel=-1, dither=1.0, energy_floor=0.0, frame_length=25.0, frame_shift=10.0, high_freq=0.0, htk_compat=False, low_freq=20.0, num_ceps=13, min_duration=0.0, num_mel_bins=23, preemphasis_coefficient=0.97, raw_energy=True, remove_dc_offset=True, round_to_power_of_two=True, sample_frequency=16000.0, snip_edges=True, subtract_mean=False, use_energy=False, vtln_high=-500.0, vtln_low=100.0, vtln_warp=1.0, window_type='povey')[source]
+

Create a mfcc from a raw audio signal. This matches the input/output of Kaldi’s +compute-mfcc-feats.

Parameters
    -
  • waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)

  • +
  • waveform (torch.Tensor) – Tensor of audio of size (c, n) where c is in the range [0,2)

  • blackman_coeff (float) – Constant coefficient for generalized Blackman window. (Default: 0.42)

  • +
  • cepstral_lifter (float) – Constant that controls scaling of MFCCs (Default: 22.0)

  • channel (int) – Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: -1)

  • dither (float) – Dithering constant (0.0 means no dither). If you turn this off, you should set the energy_floor option, e.g. to 1.0 or 0.1 (Default: 1.0)

  • @@ -302,7 +350,13 @@

    spectrogram0.0)

  • frame_length (float) – Frame length in milliseconds (Default: 25.0)

  • frame_shift (float) – Frame shift in milliseconds (Default: 10.0)

  • +
  • high_freq (float) – High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (Default: 0.0)

  • +
  • htk_compat (bool) – If true, put energy last. Warning: not sufficient to get HTK compatible features (need +to change other parameters). (Default: False)

  • +
  • low_freq (float) – Low cutoff frequency for mel bins (Default: 20.0)

  • +
  • num_ceps (int) – Number of cepstra in MFCC computation (including C0) (Default: 13)

  • min_duration (float) – Minimum duration of segments to process (in seconds). (Default: 0.0)

  • +
  • num_mel_bins (int) – Number of triangular mel-frequency bins (Default: 23)

  • preemphasis_coefficient (float) – Coefficient for use in signal preemphasis (Default: 0.97)

  • raw_energy (bool) – If True, compute energy before preemphasis and windowing (Default: True)

  • remove_dc_offset – Subtract mean from waveform on each frame (Default: True)

  • @@ -315,15 +369,20 @@

    spectrogramTrue)

  • subtract_mean (bool) – Subtract mean of each feature file [CMS]; not recommended to do it this way. (Default: False)

  • +
  • use_energy (bool) – Use energy (not C0) in MFCC computation; the output keeps num_ceps columns. (Default: False)

  • +
  • vtln_high (float) – High inflection point in piecewise linear VTLN warping function (if +negative, offset from high-mel-freq) (Default: -500.0)

  • +
  • vtln_low (float) – Low inflection point in piecewise linear VTLN warping function (Default: 100.0)

  • +
  • vtln_warp (float) – Vtln warp factor (only applicable if vtln_map not specified) (Default: 1.0)

  • window_type (str) – Type of window (‘hamming’|’hanning’|’povey’|’rectangular’|’blackman’) (Default: 'povey')

Returns
-

A spectrogram identical to what Kaldi would output. The shape is -(m, padded_window_size // 2 + 1) where m is calculated in _get_strided

+

A mfcc identical to what Kaldi would output. The shape is (m, num_ceps) +where m is calculated in _get_strided

Return type
-

torch.Tensor

+

torch.Tensor

@@ -344,7 +403,7 @@

resample_waveform
Parameters
Return type
-

torch.Tensor

+

torch.Tensor

@@ -409,8 +468,9 @@

resample_waveform
  • torchaudio.compliance.kaldi
      +
    • mfcc() (in module torchaudio.compliance.kaldi) +
    • mu_law_decoding() (in module torchaudio.functional)
    • mu_law_encoding() (in module torchaudio.functional) diff --git a/index.html b/index.html index fc535b8f52..39731b8a8e 100644 --- a/index.html +++ b/index.html @@ -364,7 +364,7 @@

      torchaudioParameters
      @@ -436,7 +436,7 @@

      torchaudioParameters
      Return type
      -

      Generator[str, torch.Tensor]

      +

      Generator[str, torch.Tensor]

      @@ -290,7 +290,7 @@

      read_vec_flt_ark

      The string is the key and the tensor is the vector read from file

      Return type
      -

      Generator[str, torch.Tensor]

      +

      Generator[str, torch.Tensor]

      @@ -320,7 +320,7 @@

      read_mat_scp

      The string is the key and the tensor is the matrix read from file

      Return type
      -

      Generator[str, torch.Tensor]

      +

      Generator[str, torch.Tensor]

      @@ -347,7 +347,7 @@

      read_mat_ark

      The string is the key and the tensor is the matrix read from file

      Return type
      -

      Generator[str, torch.Tensor]

      +

      Generator[str, torch.Tensor]

      diff --git a/objects.inv b/objects.inv index 72131abc16..cd2e8b74b7 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/searchindex.js b/searchindex.js index 6b9aa981fc..063bad04fc 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({docnames:["compliance.kaldi","datasets","functional","index","kaldi_io","sox_effects","transforms"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["compliance.kaldi.rst","datasets.rst","functional.rst","index.rst","kaldi_io.rst","sox_effects.rst","transforms.rst"],objects:{"":{torchaudio:[3,0,0,"-"]},"torchaudio._docs.AmplitudeToDB":{forward:[6,1,1,""]},"torchaudio._docs.MFCC":{forward:[6,1,1,""]},"torchaudio._docs.MelScale":{forward:[6,1,1,""]},"torchaudio._docs.MelSpectrogram":{forward:[6,1,1,""]},"torchaudio._docs.MuLawDecoding":{forward:[6,1,1,""]},"torchaudio._docs.MuLawEncoding":{forward:[6,1,1,""]},"torchaudio._docs.Resample":{forward:[6,1,1,""]},"torchaudio._docs.Spectrogram":{forward:[6,1,1,""]},"torchaudio.compliance.kaldi":{fbank:[0,2,1,""],resample_waveform:[0,2,1,""],spectrogram:[0,2,1,""]},"torchaudio.datasets":{VCTK:[1,3,1,""],YESNO:[1,3,1,""]},"torchaudio.datasets.VCTK":{__getitem__:[1,1,1,""]},"torchaudio.datasets.YESNO":{__getitem__:[1,1,1,""]},"torchaudio.functional":{amplitude_to_DB:[2,2,1,""],angle:[2,2,1,""],complex_norm:[2,2,1,""],create_dct:[2,2,1,""],create_fb_matrix:[2,2,1,""],istft:[2,2,1,""],magphase:[2,2,1,""],mu_law_decoding:[2,2,1,""],mu_law_encoding:[2,2,1,""],phase_vocoder:[2,2,1,""],spectrogram:[2,2,1,""]},"torchaudio.kaldi_io":{read_mat_ark:[4,2,1,""],read_mat_scp:[4,2,1,""],read_vec_flt_ark:[4,2,1,""],read_vec_flt_scp:[4,2,1,""],read_vec_int_ark:[4,2,1,""]},"torchaudio.s
ox_effects":{SoxEffect:[5,3,1,""],SoxEffectsChain:[5,3,1,""]},"torchaudio.sox_effects.SoxEffectsChain":{append_effect_to_chain:[5,1,1,""],clear_chain:[5,1,1,""],set_input_file:[5,1,1,""],sox_build_flow_effects:[5,1,1,""]},"torchaudio.transforms":{AmplitudeToDB:[6,3,1,""],MFCC:[6,3,1,""],MelScale:[6,3,1,""],MelSpectrogram:[6,3,1,""],MuLawDecoding:[6,3,1,""],MuLawEncoding:[6,3,1,""],Resample:[6,3,1,""],Spectrogram:[6,3,1,""]},torchaudio:{get_sox_bool:[3,2,1,""],get_sox_encoding_t:[3,2,1,""],get_sox_option_t:[3,2,1,""],info:[3,2,1,""],initialize_sox:[3,2,1,""],load:[3,2,1,""],load_wav:[3,2,1,""],save:[3,2,1,""],save_encinfo:[3,2,1,""],shutdown_sox:[3,2,1,""],sox_encodinginfo_t:[3,2,1,""],sox_signalinfo_t:[3,2,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","method","Python method"],"2":["py","function","Python function"],"3":["py","class","Python class"]},objtypes:{"0":"py:module","1":"py:method","2":"py:function","3":"py:class"},terms:{"16000hz":5,"179d6e9a88202ab0a2f":6,"boolean":[3,5],"byte":3,"class":[1,5,6],"default":[0,1,2,3,5,6],"enum":3,"final":3,"float":[0,2,3,6],"function":[1,3,5,6],"import":2,"int":[0,1,2,3,4,5,6],"new":0,"return":[0,1,2,3,4,5,6],"short":2,"true":[0,1,2,3,5,6],CMS:0,For:[1,2,3,6],Not:3,The:[0,1,2,3,4,5,6],These:2,Use:2,Useful:1,__getitem__:[1,5],__init__:5,__len__:[1,5],__members__:3,_get_strid:0,_length:2,_modul:6,abs:3,absolut:0,accord:4,add:0,addit:2,advanc:2,after:[2,3,6],again:1,aggress:2,algorithm:[2,6],all:[1,2,3],almost:1,alreadi:1,altern:1,amin:2,amount:2,amplitud:[2,6],amplitude_to_db:3,amplitudetodb:3,angl:3,anoth:6,api:1,append:5,append_effect_to_chain:5,appli:[0,2,6],applic:0,apr:2,arg:1,argument:[1,6],ark:4,around:[0,4],asr:0,assp:2,assum:[2,3,5,6],attempt:3,attribut:5,audio:[0,1,2,3,5,6],audiodir_path:5,automat:[3,5],avail:1,bandlimit:0,bank:[2,6],base:[2,6],batch:2,batch_siz:1,becaus:2,been:[2,6],befor:[0,2,6],begin:3,being:[2,6],between:[2,5,6],bin:[0,2,6],bit:[3,5],bits_per_sampl:3,blackman:0,blackman_coeff:
0,blob:0,bool:[0,1,2,3,5,6],both:2,build:5,built:6,calcul:[0,2,6],call:3,callabl:[1,3,5,6],can:[0,1,2,3,6],cancel:2,cannot:[2,3,5],caution:0,ccrma:0,ceil:2,center:2,cepstrum:6,certain:2,chain:[3,5,6],chang:[0,3],channel:[0,2,3,5,6],channels_first:[3,5],check:2,choos:3,clamp:2,clean:1,clear:5,clear_chain:5,clip:[2,6],coeffici:[0,2,6],column:2,com:[0,6],common:[1,2,3,6],compand:[2,6],compat:0,complet:0,complex:2,complex_norm:3,complex_specgram:2,complex_specgrams_stretch:2,complex_tensor:2,complianc:3,compon:0,composit:6,compress:3,comput:[0,2],condit:2,consider:2,consist:[3,6],constant:[0,2],control:[0,2,6],conveni:3,convers:[2,6],convert:[2,6],core:6,cosin:6,could:[2,3,5],creat:[0,2,3,4,5,6],create_dct:3,create_fb_matrix:3,cut:[2,6],cutoff:0,data:[0,1,2,3,5],data_load:1,data_vol_norm:3,dataload:1,dataset:[3,5],db_multipli:2,dct:[2,6],dct_type:6,decibel:[2,6],decod:[2,6],def:5,depend:[0,2,4,6],descriptor:4,desir:[0,6],determin:[3,5],dev_mod:1,devic:6,dict:[3,6],dictionari:4,differ:[2,6],dimens:[0,2,6],directori:1,discard:2,discret:6,disk:3,distanc:2,dither:0,divid:[3,5],down:2,download:1,downsampl:[0,1],durat:0,each:[0,2,3,6],earg:5,edu:0,effect:[0,3,5],effici:0,either:2,element:[0,2],elementwis:6,els:0,enam:5,encod:[2,3,6],encodinginfo:3,end:[0,2],endian:3,energi:[0,2,6],energy_floor:0,entri:[2,6],envelop:2,eopt:5,epsilon:0,estim:2,etc:[2,3,6],everyth:3,exactli:2,exampl:[1,2,3,4,5,6],exist:1,expect:[0,2,6],expon:[2,6],extens:[3,5],extra:0,extract:0,f_max:[2,6],f_min:[2,6],factor:[0,2,3],fals:[0,1,2,6],fault:3,feat:0,featur:0,fft:[0,2,6],fft_size:2,file:[0,1,3,4,5],file_or_fd:4,filedescriptor:4,filepath:3,filetyp:[3,5],filter:[0,2,6],filterbank:[0,2,6],finish:3,first:[3,5,6],fit:[0,2],fix:0,float32:4,float64:4,floor:0,flow:5,follow:[1,5],foo:3,format:3,forward:6,fourier:[2,6],frame:[0,2,3,5,6],frame_length:0,frame_shift:0,freq:[0,2,6],frequenc:[0,2,6],from:[0,1,2,3,4,5,6],full:[2,6],gener:[0,4],get:[0,3],get_sox_bool:3,get_sox_encoding_t:3,get_sox_option_t:3,gist:6,g
ithub:[0,6],give:6,given:[0,2,3,5,6],griffin:2,gzip:4,ham:0,han:0,handl:0,hann_window:6,has:[0,2,6],have:1,haythamfayek:6,headroom:3,hebrew:1,help:[3,5],henc:1,here:[5,6],high:0,high_freq:0,highlight:2,hop:[2,6],hop_length:[2,6],htk:0,htk_compat:0,html:[0,6],http:[0,6],ident:0,ieee:2,imag:1,implement:[1,6],importantli:3,index:[1,5],individu:0,inflect:0,info:[2,3,6],inform:[2,5],infti:2,initi:3,initialize_sox:[3,5],input:[0,1,2,3,5,6],input_fil:5,instal:4,instead:[3,6],integ:[3,5],internet:1,interpol:0,interv:0,invers:[2,6],isn:2,istft:3,its:2,join:5,jos:0,kaldi:[3,4],kaldi_io:3,kastnerkyl:6,keep:1,kei:4,kwarg:3,l56:0,lambda:3,last:[0,2],law:[2,6],learn:6,least:2,left:[0,2],len:5,length:[0,2,3,5,6],less:0,librosa:6,light:4,like:3,lim:2,linear:0,linearli:0,linearresampl:0,linspac:2,list:[3,5],listdir:5,load:[1,3,6],load_wav:3,log10:2,log:[0,6],log_mel:6,loss:2,lossi:3,low:0,low_freq:0,lowpass_filter_width:0,machin:6,magnitud:[0,2,6],magphas:3,mai:[2,6],master:0,match:0,math:2,matric:3,matrix:[2,4,6],max:[2,3],maximum:[2,6],mean:[0,2],mel:[0,2,6],mel_specgram:6,melkwarg:6,melscal:3,melspectrogram:3,metadata:[3,5],method:[1,2,6],mfc:[2,6],mfcc:3,millisecond:0,min_dur:0,minimum:[0,2,6],modifi:2,modul:4,mono:[0,5],more:[0,2,6],mp3:3,mu_law_decod:3,mu_law_encod:3,mulawdecod:3,mulawencod:3,mult:3,multipl:1,multipli:[2,3,6],multiprocess:1,must:[0,2,6],mydataset:5,n_fft:[2,6],n_frame:[2,6],n_freq:2,n_mel:[2,6],n_mfcc:[2,6],n_stft:6,name:5,need:[0,3,4,6],neg:[0,2,6],neighbor:2,never:2,new_freq:[0,6],nibbl:3,nola:2,none:[1,2,3,5,6],nonzero:2,norm:[2,6],normal:[0,2,3,5,6],note:3,nthread:1,num_fram:3,num_freq:2,num_mel_bin:0,num_work:1,number:[0,2,3,5,6],numer:6,numeric_limit:0,nyquist:0,object:[3,5,6],occur:2,off:[0,2,6],offlinefeaturetpl:0,offset:[0,3],onc:3,one:[3,6],ones:2,onesid:2,onli:[0,3],open:4,oper:[0,2],opposite_endian:3,option:[0,1,2,3,5,6],orig_freq:[0,6],origin:[0,2,6],ortho:[2,6],other:0,out:[3,5],out_encinfo:5,out_siginfo:5,output:[0,1,2,3,5,6],overlap:2,packag:3,
pad:[0,2,6],pad_mod:2,padded_window_s:0,parallelli:1,paramet:[0,1,2,3,4,5,6],pass:[1,3,5],path:[3,5],path_to_audio_fil:5,pathlib:3,per:3,perform:[0,1,2],phase:2,phase_adv:2,phase_vocod:3,piecewis:0,pil:1,pipe:4,pitch:2,point:[0,2],popular:3,possibl:3,povei:0,power:[0,2,6],practic:3,precis:3,preemphasi:0,preemphasis_coeffici:0,preprocess:5,prevent:2,primarili:3,print:3,process:[0,1,6],produc:0,provid:2,put:[0,1],python:[3,5,6],quantization_channel:[2,6],randn:2,rang:0,rate:[2,3,5,6],rather:3,raw:[0,1,2,5,6],raw_energi:0,read:4,reason:[2,6],recommend:0,rectangular:0,refer:[2,3],reflect:[0,2],rel:0,remov:2,remove_dc_offset:0,repres:0,requir:3,resampl:[0,3,5],resamplewaveform:0,resampling_method:6,respect:1,result:[2,3,5],retain:[2,6],revers:3,reverse_bit:3,reverse_byt:3,reverse_nibbl:3,right:[0,2,3],root:1,round:0,round_to_power_of_two:0,row:2,run:3,same:[2,3],sampl:[0,1,3,5,6],sample_frequ:0,sample_r:[3,6],save:3,save_encinfo:3,scale:[2,6],scp:4,second:0,see:[2,3,6],seg:3,segment:0,self:5,separ:2,sequenti:6,set:[0,3,5],set_input_fil:5,shape:[0,2,3],sharp:0,sharper:0,shift:[0,3],shorter:2,should:[0,2],showdown:3,shuffl:1,shutdown:3,shutdown_sox:[3,5],side:[2,6],sig:5,sign:[3,5],signal:[0,1,2,3,5,6],signal_length:2,signalinfo:3,similar:[0,1],simpl:3,sinc:[0,2],sinc_interpol:6,size:[0,2,3,5,6],slide:2,slow:2,snip_edg:0,snippet:[2,6],some:2,someth:5,sourc:[0,1,2,3,4,5,6],sox:[3,5],sox_bool:3,sox_build_flow_effect:5,sox_effect:3,sox_encoding_t:3,sox_encodinginfo_t:[3,5],sox_fals:3,sox_option_default:3,sox_option_t:3,sox_signalinfo_t:[3,5],soxeffect:3,soxeffectschain:3,space:0,spec_f:6,specgram:6,specgram_mel_db:6,specif:[2,3],specifi:[0,3],spectrogram:[1,3],spectrum:6,speech:6,speed:2,split:[2,6],squar:[2,6],src:[0,3],stabl:6,standard:3,stanford:0,start:3,std:0,stft:[2,6],stft_matrix:2,str:[0,1,2,3,4,5,6],stream:4,string:4,stype:6,subclass:1,subtract:0,subtract_mean:0,suffici:0,suggest:0,sum_:2,summat:2,suppos:2,take:1,target:1,target_transform:1,tensor:[0,1,2,3,4,5,6],tes
t:[1,6],text:2,textbook:6,than:2,thei:[1,2,6],theory_ideal_bandlimited_interpol:0,thi:[0,2,3,4,6],time:[2,6],timsainb:6,togeth:6,top_db:[2,6],torch:[0,1,2,3,4,5,6],total:0,train:1,tran:2,transcript:1,transform:[1,2,3],triangular:[0,2,6],trim:2,tupl:[1,2,3,4,5],turn:[0,2,6],two:[0,1,2,6],type:[0,1,2,3,4,5,6],ulaw:3,unchang:[2,6],uniqu:3,unknown:3,unspecifi:3,upsampl:0,url:1,use:[0,2,3,4,6],use_energi:0,use_log_fbank:0,use_pow:0,used:[2,3],useful:[0,2],user:6,uses:[0,3,6],using:[1,2,6],util:1,valu:[2,3,6],variou:0,vctk:3,vector:3,version:1,vol:2,vtln:0,vtln_high:0,vtln_low:0,vtln_map:0,vtln_warp:0,wai:0,warn:0,warp:0,wav:[3,6],wave:3,waveform:[0,2,6],what:0,when:[2,3],where:[0,1,2,3,5,6],whether:[1,2,6],which:[0,1,2,3,4,5,6],whole:2,wikipedia:[2,6],win_length:[2,6],window:[0,2,6],window_fn:6,window_typ:0,wise:2,without:[2,3],wkwarg:6,worker:1,would:[0,2],wrapper:4,written:5,x_mu:[2,6],yesno:3,yesno_data:1,you:[0,3],zero:[0,2],zeroth:0},titles:["torchaudio.compliance.kaldi","torchaudio.datasets","torchaudio.functional","torchaudio","torchaudio.kaldi_io","torchaudio.sox_effects","torchaudio.transforms"],titleterms:{"function":[0,2],amplitude_to_db:2,amplitudetodb:6,angl:2,complex_norm:2,complianc:0,create_dct:2,create_fb_matrix:2,dataset:1,fbank:0,istft:2,kaldi:0,kaldi_io:4,magphas:2,matric:4,melscal:6,melspectrogram:6,mfcc:6,mu_law_decod:2,mu_law_encod:2,mulawdecod:6,mulawencod:6,phase_vocod:2,read_mat_ark:4,read_mat_scp:4,read_vec_flt_ark:4,read_vec_flt_scp:4,read_vec_int_ark:4,resampl:6,resample_waveform:0,sox_effect:5,soxeffect:5,soxeffectschain:5,spectrogram:[0,2,6],torchaudio:[0,1,2,3,4,5,6],transform:6,vctk:1,vector:4,yesno:1}}) \ No newline at end of file 
+Search.setIndex({docnames:["compliance.kaldi","datasets","functional","index","kaldi_io","sox_effects","transforms"],envversion:{"sphinx.domains.c":1,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":1,"sphinx.domains.javascript":1,"sphinx.domains.math":2,"sphinx.domains.python":1,"sphinx.domains.rst":1,"sphinx.domains.std":1,"sphinx.ext.intersphinx":1,"sphinx.ext.todo":1,"sphinx.ext.viewcode":1,sphinx:56},filenames:["compliance.kaldi.rst","datasets.rst","functional.rst","index.rst","kaldi_io.rst","sox_effects.rst","transforms.rst"],objects:{"":{torchaudio:[3,0,0,"-"]},"torchaudio._docs.AmplitudeToDB":{forward:[6,1,1,""]},"torchaudio._docs.MFCC":{forward:[6,1,1,""]},"torchaudio._docs.MelScale":{forward:[6,1,1,""]},"torchaudio._docs.MelSpectrogram":{forward:[6,1,1,""]},"torchaudio._docs.MuLawDecoding":{forward:[6,1,1,""]},"torchaudio._docs.MuLawEncoding":{forward:[6,1,1,""]},"torchaudio._docs.Resample":{forward:[6,1,1,""]},"torchaudio._docs.Spectrogram":{forward:[6,1,1,""]},"torchaudio.compliance.kaldi":{fbank:[0,2,1,""],mfcc:[0,2,1,""],resample_waveform:[0,2,1,""],spectrogram:[0,2,1,""]},"torchaudio.datasets":{VCTK:[1,3,1,""],YESNO:[1,3,1,""]},"torchaudio.datasets.VCTK":{__getitem__:[1,1,1,""]},"torchaudio.datasets.YESNO":{__getitem__:[1,1,1,""]},"torchaudio.functional":{amplitude_to_DB:[2,2,1,""],angle:[2,2,1,""],complex_norm:[2,2,1,""],create_dct:[2,2,1,""],create_fb_matrix:[2,2,1,""],istft:[2,2,1,""],magphase:[2,2,1,""],mu_law_decoding:[2,2,1,""],mu_law_encoding:[2,2,1,""],phase_vocoder:[2,2,1,""],spectrogram:[2,2,1,""]},"torchaudio.kaldi_io":{read_mat_ark:[4,2,1,""],read_mat_scp:[4,2,1,""],read_vec_flt_ark:[4,2,1,""],read_vec_flt_scp:[4,2,1,""],read_vec_int_ark:[4,2,1,""]},"torchaudio.sox_effects":{SoxEffect:[5,3,1,""],SoxEffectsChain:[5,3,1,""]},"torchaudio.sox_effects.SoxEffectsChain":{append_effect_to_chain:[5,1,1,""],clear_chain:[5,1,1,""],set_input_file:[5,1,1,""],sox_build_flow_effects:[5,1,1,""]},"torchaudio.transforms":{Ampli
tudeToDB:[6,3,1,""],MFCC:[6,3,1,""],MelScale:[6,3,1,""],MelSpectrogram:[6,3,1,""],MuLawDecoding:[6,3,1,""],MuLawEncoding:[6,3,1,""],Resample:[6,3,1,""],Spectrogram:[6,3,1,""]},torchaudio:{get_sox_bool:[3,2,1,""],get_sox_encoding_t:[3,2,1,""],get_sox_option_t:[3,2,1,""],info:[3,2,1,""],initialize_sox:[3,2,1,""],load:[3,2,1,""],load_wav:[3,2,1,""],save:[3,2,1,""],save_encinfo:[3,2,1,""],shutdown_sox:[3,2,1,""],sox_encodinginfo_t:[3,2,1,""],sox_signalinfo_t:[3,2,1,""]}},objnames:{"0":["py","module","Python module"],"1":["py","method","Python method"],"2":["py","function","Python function"],"3":["py","class","Python class"]},objtypes:{"0":"py:module","1":"py:method","2":"py:function","3":"py:class"},terms:{"16000hz":5,"179d6e9a88202ab0a2f":6,"boolean":[3,5],"byte":3,"class":[1,5,6],"default":[0,1,2,3,5,6],"enum":3,"final":3,"float":[0,2,3,6],"function":[1,3,5,6],"import":2,"int":[0,1,2,3,4,5,6],"new":0,"return":[0,1,2,3,4,5,6],"short":2,"true":[0,1,2,3,5,6],CMS:0,For:[1,2,3,6],Not:3,The:[0,1,2,3,4,5,6],These:2,Use:2,Useful:1,__getitem__:[1,5],__init__:5,__len__:[1,5],__members__:3,_get_strid:0,_length:2,_modul:6,abs:3,absolut:0,accord:4,add:0,addit:2,advanc:2,after:[2,3,6],again:1,aggress:2,algorithm:[2,6],all:[1,2,3],almost:1,alreadi:1,altern:1,amin:2,amount:2,amplitud:[2,6],amplitude_to_db:3,amplitudetodb:3,angl:3,anoth:6,api:1,append:5,append_effect_to_chain:5,appli:[0,2,6],applic:0,apr:2,arg:1,argument:[1,6],ark:4,around:[0,4],asr:0,assp:2,assum:[2,3,5,6],attempt:3,attribut:5,audio:[0,1,2,3,5,6],audiodir_path:5,automat:[3,5],avail:1,bandlimit:0,bank:[2,6],base:[2,6],batch:2,batch_siz:1,becaus:2,been:[2,6],befor:[0,2,6],begin:3,being:[2,6],between:[2,5,6],bin:[0,2,6],bit:[3,5],bits_per_sampl:3,blackman:0,blackman_coeff:0,blob:0,bool:[0,1,2,3,5,6],both:2,build:5,built:6,calcul:[0,2,6],call:3,callabl:[1,3,5,6],can:[0,1,2,3,6],cancel:2,cannot:[2,3,5],caution:0,ccrma:0,ceil:2,center:2,cepstra:0,cepstral_lift:0,cepstrum:6,certain:2,chain:[3,5,6],chang:[0,3],channel:[0,2,3
,5,6],channels_first:[3,5],check:2,choos:3,clamp:2,clean:1,clear:5,clear_chain:5,clip:[2,6],coeffici:[0,2,6],column:2,com:[0,6],common:[1,2,3,6],compand:[2,6],compat:0,complet:0,complex:2,complex_norm:3,complex_specgram:2,complex_specgrams_stretch:2,complex_tensor:2,complianc:3,compon:0,composit:6,compress:3,comput:[0,2],condit:2,consider:2,consist:[3,6],constant:[0,2],control:[0,2,6],conveni:3,convers:[2,6],convert:[2,6],core:6,cosin:6,could:[2,3,5],creat:[0,2,3,4,5,6],create_dct:3,create_fb_matrix:3,cut:[2,6],cutoff:0,data:[0,1,2,3,5],data_load:1,data_vol_norm:3,dataload:1,dataset:[3,5],db_multipli:2,dct:[2,6],dct_type:6,decibel:[2,6],decod:[2,6],def:5,depend:[0,2,4,6],descriptor:4,desir:[0,6],determin:[3,5],dev_mod:1,devic:6,dict:[3,6],dictionari:4,differ:[2,6],dimens:[0,2,6],directori:1,discard:2,discret:6,disk:3,distanc:2,dither:0,divid:[3,5],down:2,download:1,downsampl:[0,1],durat:0,each:[0,2,3,6],earg:5,edu:0,effect:[0,3,5],effici:0,either:2,element:[0,2],elementwis:6,els:0,enam:5,encod:[2,3,6],encodinginfo:3,end:[0,2],endian:3,energi:[0,2,6],energy_floor:0,entri:[2,6],envelop:2,eopt:5,epsilon:0,estim:2,etc:[2,3,6],everyth:3,exactli:2,exampl:[1,2,3,4,5,6],exist:1,expect:[0,2,6],expon:[2,6],extens:[3,5],extra:0,extract:0,f_max:[2,6],f_min:[2,6],factor:[0,2,3],fals:[0,1,2,6],fault:3,feat:0,featur:0,fft:[0,2,6],fft_size:2,file:[0,1,3,4,5],file_or_fd:4,filedescriptor:4,filepath:3,filetyp:[3,5],filter:[0,2,6],filterbank:[0,2,6],finish:3,first:[3,5,6],fit:[0,2],fix:0,float32:4,float64:4,floor:0,flow:5,follow:[1,5],foo:3,format:3,forward:6,fourier:[2,6],frame:[0,2,3,5,6],frame_length:0,frame_shift:0,freq:[0,2,6],frequenc:[0,2,6],from:[0,1,2,3,4,5,6],full:[2,6],gener:[0,4],get:[0,3],get_sox_bool:3,get_sox_encoding_t:3,get_sox_option_t:3,gist:6,github:[0,6],give:6,given:[0,2,3,5,6],griffin:2,gzip:4,ham:0,han:0,handl:0,hann_window:6,has:[0,2,6],have:1,haythamfayek:6,headroom:3,hebrew:1,help:[3,5],henc:1,here:[5,6],high:0,high_freq:0,highlight:2,hop:[2,6],hop_length:[2,
6],htk:0,htk_compat:0,html:[0,6],http:[0,6],ident:0,ieee:2,imag:1,implement:[1,6],importantli:3,includ:0,index:[1,5],individu:0,inflect:0,info:[2,3,6],inform:[2,5],infti:2,initi:3,initialize_sox:[3,5],input:[0,1,2,3,5,6],input_fil:5,instal:4,instead:[3,6],integ:[3,5],internet:1,interpol:0,interv:0,invers:[2,6],isn:2,istft:3,its:2,join:5,jos:0,kaldi:[3,4],kaldi_io:3,kastnerkyl:6,keep:1,kei:4,kwarg:3,l56:0,lambda:3,last:[0,2],law:[2,6],learn:6,least:2,left:[0,2],len:5,length:[0,2,3,5,6],less:0,librosa:6,light:4,like:3,lim:2,linear:0,linearli:0,linearresampl:0,linspac:2,list:[3,5],listdir:5,load:[1,3,6],load_wav:3,log10:2,log:[0,6],log_mel:6,loss:2,lossi:3,low:0,low_freq:0,lowpass_filter_width:0,machin:6,magnitud:[0,2,6],magphas:3,mai:[2,6],master:0,match:0,math:2,matric:3,matrix:[2,4,6],max:[2,3],maximum:[2,6],mean:[0,2],mel:[0,2,6],mel_specgram:6,melkwarg:6,melscal:3,melspectrogram:3,metadata:[3,5],method:[1,2,6],mfc:[2,6],mfcc:3,millisecond:0,min_dur:0,minimum:[0,2,6],modifi:2,modul:4,mono:[0,5],more:[0,2,6],mp3:3,mu_law_decod:3,mu_law_encod:3,mulawdecod:3,mulawencod:3,mult:3,multipl:1,multipli:[2,3,6],multiprocess:1,must:[0,2,6],mydataset:5,n_fft:[2,6],n_frame:[2,6],n_freq:2,n_mel:[2,6],n_mfcc:[2,6],n_stft:6,name:5,need:[0,3,4,6],neg:[0,2,6],neighbor:2,never:2,new_freq:[0,6],nibbl:3,nola:2,none:[1,2,3,5,6],nonzero:2,norm:[2,6],normal:[0,2,3,5,6],note:3,nthread:1,num_cep:0,num_fram:3,num_freq:2,num_mel_bin:0,num_work:1,number:[0,2,3,5,6],numer:6,numeric_limit:0,nyquist:0,object:[3,5,6],occur:2,off:[0,2,6],offlinefeaturetpl:0,offset:[0,3],onc:3,one:[3,6],ones:2,onesid:2,onli:[0,3],open:4,oper:[0,2],opposite_endian:3,option:[0,1,2,3,5,6],orig_freq:[0,6],origin:[0,2,6],ortho:[2,6],other:0,out:[3,5],out_encinfo:5,out_siginfo:5,output:[0,1,2,3,5,6],overlap:2,packag:3,pad:[0,2,6],pad_mod:2,padded_window_s:0,parallelli:1,paramet:[0,1,2,3,4,5,6],pass:[1,3,5],path:[3,5],path_to_audio_fil:5,pathlib:3,per:3,perform:[0,1,2],phase:2,phase_adv:2,phase_vocod:3,piecewis:0,pil:1,pip
e:4,pitch:2,point:[0,2],popular:3,possibl:3,povei:0,power:[0,2,6],practic:3,precis:3,preemphasi:0,preemphasis_coeffici:0,preprocess:5,prevent:2,primarili:3,print:3,process:[0,1,6],produc:0,provid:2,put:[0,1],python:[3,5,6],quantization_channel:[2,6],randn:2,rang:0,rate:[2,3,5,6],rather:3,raw:[0,1,2,5,6],raw_energi:0,read:4,reason:[2,6],recommend:0,rectangular:0,refer:[2,3],reflect:[0,2],rel:0,remov:2,remove_dc_offset:0,repres:0,requir:3,resampl:[0,3,5],resamplewaveform:0,resampling_method:6,respect:1,result:[2,3,5],retain:[2,6],revers:3,reverse_bit:3,reverse_byt:3,reverse_nibbl:3,right:[0,2,3],root:1,round:0,round_to_power_of_two:0,row:2,run:3,same:[2,3],sampl:[0,1,3,5,6],sample_frequ:0,sample_r:[3,6],save:3,save_encinfo:3,scale:[0,2,6],scp:4,second:0,see:[2,3,6],seg:3,segment:0,self:5,separ:2,sequenti:6,set:[0,3,5],set_input_fil:5,shape:[0,2,3],sharp:0,sharper:0,shift:[0,3],shorter:2,should:[0,2],showdown:3,shuffl:1,shutdown:3,shutdown_sox:[3,5],side:[2,6],sig:5,sign:[3,5],signal:[0,1,2,3,5,6],signal_length:2,signalinfo:3,similar:[0,1],simpl:3,sinc:[0,2],sinc_interpol:6,size:[0,2,3,5,6],slide:2,slow:2,snip_edg:0,snippet:[2,6],some:2,someth:5,sourc:[0,1,2,3,4,5,6],sox:[3,5],sox_bool:3,sox_build_flow_effect:5,sox_effect:3,sox_encoding_t:3,sox_encodinginfo_t:[3,5],sox_fals:3,sox_option_default:3,sox_option_t:3,sox_signalinfo_t:[3,5],soxeffect:3,soxeffectschain:3,space:0,spec_f:6,specgram:6,specgram_mel_db:6,specif:[2,3],specifi:[0,3],spectrogram:[1,3],spectrum:6,speech:6,speed:2,split:[2,6],squar:[2,6],src:[0,3],stabl:6,standard:3,stanford:0,start:3,std:0,stft:[2,6],stft_matrix:2,str:[0,1,2,3,4,5,6],stream:4,string:4,stype:6,subclass:1,subtract:0,subtract_mean:0,suffici:0,suggest:0,sum_:2,summat:2,suppos:2,take:1,target:1,target_transform:1,tensor:[0,1,2,3,4,5,6],test:[1,6],text:2,textbook:6,than:2,thei:[1,2,6],theory_ideal_bandlimited_interpol:0,thi:[0,2,3,4,6],time:[2,6],timsainb:6,togeth:6,top_db:[2,6],torch:[0,1,2,3,4,5,6],total:0,train:1,tran:2,transcript:1,trans
form:[1,2,3],triangular:[0,2,6],trim:2,tupl:[1,2,3,4,5],turn:[0,2,6],two:[0,1,2,6],type:[0,1,2,3,4,5,6],ulaw:3,unchang:[2,6],uniqu:3,unknown:3,unspecifi:3,upsampl:0,url:1,use:[0,2,3,4,6],use_energi:0,use_log_fbank:0,use_pow:0,used:[2,3],useful:[0,2],user:6,uses:[0,3,6],using:[1,2,6],util:1,valu:[2,3,6],variou:0,vctk:3,vector:3,version:1,vol:2,vtln:0,vtln_high:0,vtln_low:0,vtln_map:0,vtln_warp:0,wai:0,warn:0,warp:0,wav:[3,6],wave:3,waveform:[0,2,6],what:0,when:[2,3],where:[0,1,2,3,5,6],whether:[1,2,6],which:[0,1,2,3,4,5,6],whole:2,wikipedia:[2,6],win_length:[2,6],window:[0,2,6],window_fn:6,window_typ:0,wise:2,without:[2,3],wkwarg:6,worker:1,would:[0,2],wrapper:4,written:5,x_mu:[2,6],yesno:3,yesno_data:1,you:[0,3],zero:[0,2],zeroth:0},titles:["torchaudio.compliance.kaldi","torchaudio.datasets","torchaudio.functional","torchaudio","torchaudio.kaldi_io","torchaudio.sox_effects","torchaudio.transforms"],titleterms:{"function":[0,2],amplitude_to_db:2,amplitudetodb:6,angl:2,complex_norm:2,complianc:0,create_dct:2,create_fb_matrix:2,dataset:1,fbank:0,istft:2,kaldi:0,kaldi_io:4,magphas:2,matric:4,melscal:6,melspectrogram:6,mfcc:[0,6],mu_law_decod:2,mu_law_encod:2,mulawdecod:6,mulawencod:6,phase_vocod:2,read_mat_ark:4,read_mat_scp:4,read_vec_flt_ark:4,read_vec_flt_scp:4,read_vec_int_ark:4,resampl:6,resample_waveform:0,sox_effect:5,soxeffect:5,soxeffectschain:5,spectrogram:[0,2,6],torchaudio:[0,1,2,3,4,5,6],transform:6,vctk:1,vector:4,yesno:1}}) \ No newline at end of file diff --git a/sox_effects.html b/sox_effects.html index d5647bb1d2..31e2eecddd 100644 --- a/sox_effects.html +++ b/sox_effects.html @@ -265,7 +265,7 @@

      SoxEffectsChain
      Return type
      -

      Tuple[torch.Tensor, int]

      +

      Tuple[torch.Tensor, int]

      @@ -330,7 +330,7 @@

      SoxEffectsChain

      Build effects chain and flow effects from input file to output tensor

      Parameters
      -

      out (torch.Tensor) – Where the output will be written to. (Default: None)

      +

      out (torch.Tensor) – Where the output will be written to. (Default: None)

      Returns

      An output Tensor of size [C x L] or [L x C] where L is the number @@ -338,7 +338,7 @@

      SoxEffectsChain

      Return type
      -

      Tuple[torch.Tensor, int]

      +

      Tuple[torch.Tensor, int]

      diff --git a/transforms.html b/transforms.html index 1719f81ae5..1f8d1a84da 100644 --- a/transforms.html +++ b/transforms.html @@ -218,7 +218,7 @@

      torchaudio.transforms

      -

      Transforms are common audio transforms. They can be chained together using torch.nn.Sequential

      +

      Transforms are common audio transforms. They can be chained together using torch.nn.Sequential

      Spectrogram

      @@ -233,7 +233,7 @@

      Spectrogramint, optional) – Length of hop between STFT windows. (Default: win_length // 2)

    • pad (int) – Two sided padding of signal. (Default: 0)

    • -
    • window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor +

    • window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor that is applied/multiplied to each frame/window. (Default: torch.hann_window)

    • power (int) – Exponent for the magnitude spectrogram, (must be > 0) e.g., 1 for energy, 2 for power, etc. (Default: 2)

    • @@ -247,7 +247,7 @@

      Spectrogramforward(waveform)[source]
      Parameters
      -

      waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)

      +

      waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)

      Returns

      Dimension (channel, freq, time), where channel @@ -255,7 +255,7 @@

      SpectrogramReturn type -

      torch.Tensor

      +

      torch.Tensor

      @@ -289,13 +289,13 @@

      AmplitudeToDBhttps://librosa.github.io/librosa/_modules/librosa/core/spectrum.html

      Parameters
      -

      x (torch.Tensor) – Input tensor before being converted to decibel scale

      +

      x (torch.Tensor) – Input tensor before being converted to decibel scale

      Returns

      Output tensor in decibel scale

      Return type
      -

      torch.Tensor

      +

      torch.Tensor

      @@ -328,13 +328,13 @@

      MelScaleforward(specgram)[source]
      Parameters
      -

      specgram (torch.Tensor) – A spectrogram STFT of dimension (channel, freq, time)

      +

      specgram (torch.Tensor) – A spectrogram STFT of dimension (channel, freq, time)

      Returns

      Mel frequency spectrogram of size (channel, n_mels, time)

      Return type
      -

      torch.Tensor

      +

      torch.Tensor

      @@ -369,7 +369,7 @@

      MelSpectrogram

      f_max (float, optional) – Maximum frequency. (Default: None)

    • pad (int) – Two sided padding of signal. (Default: 0)

    • n_mels (int) – Number of mel filterbanks. (Default: 128)

    • -
    • window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor +

    • window_fn (Callable[[..], torch.Tensor]) – A function to create a window tensor that is applied/multiplied to each frame/window. (Default: torch.hann_window)

    • wkwargs (Dict[.., ..]) – Arguments for window function. (Default: None)

    @@ -387,13 +387,13 @@

    MelSpectrogramforward(waveform)[source]
    Parameters
    -

    waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)

    +

    waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)

    Returns

    Mel frequency spectrogram of size (channel, n_mels, time)

    Return type
    -

    torch.Tensor

    +

    torch.Tensor

    @@ -431,13 +431,13 @@

    MFCCforward(waveform)[source]
    Parameters
    -

    waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)

    +

    waveform (torch.Tensor) – Tensor of audio of dimension (channel, time)

    Returns

    specgram_mel_db of size (channel, n_mfcc, time)

    Return type
    -

    torch.Tensor

    +

    torch.Tensor

    @@ -464,13 +464,13 @@

    MuLawEncodingforward(x)[source]
    Parameters
    -

    x (torch.Tensor) – A signal to be encoded

    +

    x (torch.Tensor) – A signal to be encoded

    Returns

    An encoded signal

    Return type
    -

    x_mu (torch.Tensor)

    +

    x_mu (torch.Tensor)

    @@ -497,13 +497,13 @@

    MuLawDecodingforward(x_mu)[source]
    Parameters
    -

    x_mu (torch.Tensor) – A mu-law encoded signal which needs to be decoded

    +

    x_mu (torch.Tensor) – A mu-law encoded signal which needs to be decoded

    Returns

    The decoded signal

    Return type
    -

    torch.Tensor

    +

    torch.Tensor

    @@ -532,13 +532,13 @@

    Resampleforward(waveform)[source]
    Parameters
    -

    waveform (torch.Tensor) – The input signal of dimension (channel, time)

    +

    waveform (torch.Tensor) – The input signal of dimension (channel, time)

    Returns

    Output signal of dimension (channel, time)

    Return type
    -

    torch.Tensor

    +

    torch.Tensor