pytorch · mthrok · Oct 25, 2021 · Oct 7, 2021
@@ -153,6 +153,14 @@ WAV2VEC2_ASR_LARGE_LV60K_960H
    .. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
       :no-value:
 
+VOXPOPULI_ASR_BASE_10K_FR
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. container:: py attribute
+
+   .. autodata:: VOXPOPULI_ASR_BASE_10K_FR
+      :no-value:
+
 HUBERT_ASR_LARGE
 ~~~~~~~~~~~~~~~~
 

@@ -1,3 +1,25 @@
+@article{voxpopuli,
+  author    = {Changhan Wang and
+               Morgane Rivi{\`{e}}re and
+               Ann Lee and
+               Anne Wu and
+               Chaitanya Talnikar and
+               Daniel Haziza and
+               Mary Williamson and
+               Juan Miguel Pino and
+               Emmanuel Dupoux},
+  title     = {VoxPopuli: {A} Large-Scale Multilingual Speech Corpus for Representation
+               Learning, Semi-Supervised Learning and Interpretation},
+  journal   = {CoRR},
+  volume    = {abs/2101.00390},
+  year      = {2021},
+  url       = {https://arxiv.org/abs/2101.00390},
+  eprinttype = {arXiv},
+  eprint    = {2101.00390},
+  timestamp = {Thu, 12 Aug 2021 15:37:06 +0200},
+  biburl    = {https://dblp.org/rec/journals/corr/abs-2101-00390.bib},
+  bibsource = {dblp computer science bibliography, https://dblp.org}
+}
 @article{specaugment,
    title={SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition},
    url={http://dx.doi.org/10.21437/Interspeech.2019-2680},

@@ -34,6 +34,7 @@ def ctc_decoder():
 
 _FILES = {
     'en': 'Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac',
+    'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac',
 }
 
 

@@ -18,6 +18,7 @@
     HUBERT_XLARGE,
     HUBERT_ASR_LARGE,
     HUBERT_ASR_XLARGE,
+    VOXPOPULI_ASR_BASE_10K_FR,
 )
 import pytest
 
@@ -53,6 +54,7 @@ def test_pretraining_models(bundle):
         (WAV2VEC2_ASR_LARGE_LV60K_960H, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
         (HUBERT_ASR_LARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
         (HUBERT_ASR_XLARGE, 'en', 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (VOXPOPULI_ASR_BASE_10K_FR, 'fr', 'la|commission|va|faire|des|propositions|sur|ce|sujet|comment|mettre|en|place|cette|capacité|fiscale|et|le|conseil|européen|y|reviendra|sour|les|sujets|au|moins|de|mars'),  # noqa: E501
     ]
 )
 def test_finetune_asr_model(

diff --git a/torchaudio/pipelines/__init__.py b/torchaudio/pipelines/__init__.py
@@ -14,6 +14,7 @@
     WAV2VEC2_ASR_LARGE_LV60K_100H,
     WAV2VEC2_ASR_LARGE_LV60K_960H,
     WAV2VEC2_XLSR53,
+    VOXPOPULI_ASR_BASE_10K_FR,
     HUBERT_BASE,
     HUBERT_LARGE,
     HUBERT_XLARGE,
@@ -44,6 +45,7 @@
     'WAV2VEC2_ASR_LARGE_LV60K_100H',
     'WAV2VEC2_ASR_LARGE_LV60K_960H',
     'WAV2VEC2_XLSR53',
+    'VOXPOPULI_ASR_BASE_10K_FR',
     'HUBERT_BASE',
     'HUBERT_LARGE',
     'HUBERT_XLARGE',

diff --git a/torchaudio/pipelines/_wav2vec2/impl.py b/torchaudio/pipelines/_wav2vec2/impl.py
@@ -969,3 +969,49 @@ def get_labels(
 
 Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
 """  # noqa: E501
+
+
+# Bundle for the wav2vec 2.0 "Base" checkpoint fine-tuned for French ASR on
+# VoxPopuli. The user-facing description is attached via ``__doc__`` right
+# after the constructor call.
+VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle(
+    'wav2vec2_voxpopuli_base_10k_asr_fr.pt',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.1,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.1,
+        # NOTE(review): utils._get_fr_labels() lists 42 entries while this is
+        # 43 -- presumably the difference is accounted for by the bundle's
+        # label handling; confirm against Wav2Vec2ASRBundle.get_labels.
+        "aux_num_out": 43
+    },
+    _labels=utils._get_fr_labels(),
+    _sample_rate=16000,
+)
+VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model with "Base" configuration.
+
+Pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset [:footcite:`voxpopuli`]
+("10k" subset, consisting of 23 languages).
+Fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.
+
+Originally published by the authors of *VoxPopuli* [:footcite:`voxpopuli`] under CC BY-NC 4.0 and
+redistributed with the same license.
+[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
+`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]
+
+Please refer to :func:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
+"""  # noqa: E501
diff --git a/torchaudio/pipelines/_wav2vec2/utils.py b/torchaudio/pipelines/_wav2vec2/utils.py
@@ -29,3 +29,50 @@ def _get_en_labels():
         'Q',
         'Z',
     )
+
+
+def _get_fr_labels():
+    """Return the label tuple passed as ``_labels`` to the "fr" ASR bundle.
+
+    Returns:
+        tuple of str: 42 entries -- ``"|"`` first, followed by lower-case
+        plain and accented Latin letters (including ``"œ"`` and ``"æ"``).
+
+    NOTE(review): the order presumably mirrors the fine-tuned model's output
+    indices, so entries should not be re-sorted -- confirm before changing.
+    """
+    return (
+        "|",
+        "e",
+        "s",
+        "n",
+        "i",
+        "t",
+        "r",
+        "a",
+        "o",
+        "u",
+        "l",
+        "d",
+        "c",
+        "p",
+        "m",
+        "é",
+        "v",
+        "q",
+        "f",
+        "g",
+        "b",
+        "h",
+        "x",
+        "à",
+        "j",
+        "è",
+        "y",
+        "ê",
+        "z",
+        "ô",
+        "k",
+        "ç",
+        "œ",
+        "û",
+        "ù",
+        "î",
+        "â",
+        "w",
+        "ï",
+        "ë",
+        "ü",
+        "æ",
+    )
-Original file line number
+Diff line change
@@ Expand Up / @@ -34,6 +34,7 @@ def ctc_decoder(): @@
     _FILES = {
         'en': 'Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.flac',
+        'fr': '20121212-0900-PLENARY-5-fr_20121212-11_37_04_10.flac',
     }
@@ Expand Down @@