Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions docs/source/pipelines.rst
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ WAV2VEC2_ASR_LARGE_LV60K_960H
.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
:no-value:

VOXPOPULI_ASR_BASE_10K_FR
~~~~~~~~~~~~~~~~~~~~~~~~~

.. container:: py attribute

.. autodata:: VOXPOPULI_ASR_BASE_10K_FR
:no-value:

HUBERT_ASR_LARGE
~~~~~~~~~~~~~~~~

Expand Down
22 changes: 22 additions & 0 deletions docs/source/refs.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
@article{voxpopuli,
author = {Changhan Wang and
Morgane Rivi{\`{e}}re and
Ann Lee and
Anne Wu and
Chaitanya Talnikar and
Daniel Haziza and
Mary Williamson and
Juan Miguel Pino and
Emmanuel Dupoux},
title = {VoxPopuli: {A} Large-Scale Multilingual Speech Corpus for Representation
Learning, Semi-Supervised Learning and Interpretation},
journal = {CoRR},
volume = {abs/2101.00390},
year = {2021},
url = {https://arxiv.org/abs/2101.00390},
eprinttype = {arXiv},
eprint = {2101.00390},
timestamp = {Thu, 12 Aug 2021 15:37:06 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2101-00390.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{specaugment,
title={SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
url={http://dx.doi.org/10.21437/Interspeech.2019-2680},
Expand Down
1 change: 1 addition & 0 deletions test/integration_tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def ctc_decoder():

# Sample audio file names keyed by a short language code ('en', 'fr').
# NOTE(review): presumably looked up by the sample-audio fixtures defined in
# this conftest; the consumer is outside the visible hunk — confirm.
_FILES = {
'en': 'Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac',
'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac',
}


Expand Down
2 changes: 2 additions & 0 deletions test/integration_tests/wav2vec2_pipeline_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
HUBERT_XLARGE,
HUBERT_ASR_LARGE,
HUBERT_ASR_XLARGE,
VOXPOPULI_ASR_BASE_10K_FR,
)
import pytest

Expand Down Expand Up @@ -53,6 +54,7 @@ def test_pretraining_models(bundle):
(WAV2VEC2_ASR_LARGE_LV60K_960H, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_LARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(HUBERT_ASR_XLARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
(VOXPOPULI_ASR_BASE_10K_FR, 'fr', 'la|commission|va|faire|des|propositions|sur|ce|sujet|comment|mettre|en|place|cette|capacité|fiscale|et|le|conseil|européen|y|reviendra|sour|les|sujets|au|moins|de|mars'), # noqa: E501
]
)
def test_finetune_asr_model(
Expand Down
2 changes: 2 additions & 0 deletions torchaudio/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
WAV2VEC2_ASR_LARGE_LV60K_100H,
WAV2VEC2_ASR_LARGE_LV60K_960H,
WAV2VEC2_XLSR53,
VOXPOPULI_ASR_BASE_10K_FR,
HUBERT_BASE,
HUBERT_LARGE,
HUBERT_XLARGE,
Expand Down Expand Up @@ -44,6 +45,7 @@
'WAV2VEC2_ASR_LARGE_LV60K_100H',
'WAV2VEC2_ASR_LARGE_LV60K_960H',
'WAV2VEC2_XLSR53',
'VOXPOPULI_ASR_BASE_10K_FR',
'HUBERT_BASE',
'HUBERT_LARGE',
'HUBERT_XLARGE',
Expand Down
46 changes: 46 additions & 0 deletions torchaudio/pipelines/_wav2vec2/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,3 +969,49 @@ def get_labels(

Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
""" # noqa: E501


# Pre-trained pipeline bundle: wav2vec 2.0 "Base" model pre-trained on VoxPopuli
# and fine-tuned for French ASR (see the description assigned to __doc__ below).
VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle(
# presumably the file name of the pre-trained weights — confirm against Wav2Vec2ASRBundle
'wav2vec2_voxpopuli_base_10k_asr_fr.pt',
# Model configuration passed to the bundle as its second positional argument.
{
"extractor_mode": "group_norm",
# NOTE(review): each tuple is presumably (out_channels, kernel_size, stride)
# for one feature-extractor convolution layer — confirm against the model factory.
"extractor_conv_layer_config": [
(512, 10, 5),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 3, 2),
(512, 2, 2),
(512, 2, 2),
],
"extractor_conv_bias": False,
"encoder_embed_dim": 768,
"encoder_projection_dropout": 0.0,
"encoder_pos_conv_kernel": 128,
"encoder_pos_conv_groups": 16,
"encoder_num_layers": 12,
"encoder_num_heads": 12,
"encoder_attention_dropout": 0.0,
"encoder_ff_interm_features": 3072,
"encoder_ff_interm_dropout": 0.1,
"encoder_dropout": 0.0,
"encoder_layer_norm_first": False,
"encoder_layer_drop": 0.1,
# NOTE(review): presumably the number of output classes of the ASR head — confirm.
"aux_num_out": 43
},
# French output symbols supplied by utils._get_fr_labels().
_labels=utils._get_fr_labels(),
# presumably the expected input sample rate in Hz — confirm
_sample_rate=16000,
)
# The bundle is an instance, so its description is attached by assigning __doc__
# directly; the text is reStructuredText (footcite roles and hyperlink targets).
VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model with "Base" configuration.

Pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset [:footcite:`voxpopuli`]
("10k" subset, consisting of 23 languages).
Fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.

Originally published by the authors of *VoxPopuli* [:footcite:`voxpopuli`] under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
""" # noqa: E501
47 changes: 47 additions & 0 deletions torchaudio/pipelines/_wav2vec2/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,50 @@ def _get_en_labels():
'Q',
'Z',
)


def _get_fr_labels():
return (
"|",
"e",
"s",
"n",
"i",
"t",
"r",
"a",
"o",
"u",
"l",
"d",
"c",
"p",
"m",
"é",
"v",
"q",
"f",
"g",
"b",
"h",
"x",
"à",
"j",
"è",
"y",
"ê",
"z",
"ô",
"k",
"ç",
"œ",
"û",
"ù",
"î",
"â",
"w",
"ï",
"ë",
"ü",
"æ",
)