diff --git a/docs/source/models.rst b/docs/source/models.rst
index b266b6548c..50b285841f 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -139,6 +139,45 @@ Pre-trained Models
 
    .. autoproperty:: labels
 
+.. autodata:: WAV2VEC2_BASE
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_BASE_10M
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_BASE_100H
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_BASE_960H
+   :no-value:
+
+.. autodata:: WAV2VEC2_LARGE
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_10M
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_100H
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_960H
+   :no-value:
+
+.. autodata:: WAV2VEC2_LARGE_LV60K
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_10M
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_100H
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
+   :no-value:
+
+.. autodata:: WAV2VEC2_XLSR53
+   :no-value:
+
 .. autodata:: HUBERT_BASE
    :no-value:
diff --git a/test/integration_tests/wav2vec2_model_test.py b/test/integration_tests/wav2vec2_model_test.py
index d48d449baa..ca7c45a7aa 100644
--- a/test/integration_tests/wav2vec2_model_test.py
+++ b/test/integration_tests/wav2vec2_model_test.py
@@ -1,5 +1,18 @@
 import torchaudio
 from torchaudio.models import (
+    WAV2VEC2_BASE,
+    WAV2VEC2_LARGE,
+    WAV2VEC2_LARGE_LV60K,
+    WAV2VEC2_ASR_BASE_10M,
+    WAV2VEC2_ASR_BASE_100H,
+    WAV2VEC2_ASR_BASE_960H,
+    WAV2VEC2_ASR_LARGE_10M,
+    WAV2VEC2_ASR_LARGE_100H,
+    WAV2VEC2_ASR_LARGE_960H,
+    WAV2VEC2_ASR_LARGE_LV60K_10M,
+    WAV2VEC2_ASR_LARGE_LV60K_100H,
+    WAV2VEC2_ASR_LARGE_LV60K_960H,
+    WAV2VEC2_XLSR53,
     HUBERT_BASE,
     HUBERT_LARGE,
     HUBERT_XLARGE,
@@ -12,6 +25,10 @@
 @pytest.mark.parametrize(
     "bundle",
     [
+        WAV2VEC2_BASE,
+        WAV2VEC2_LARGE,
+        WAV2VEC2_LARGE_LV60K,
+        WAV2VEC2_XLSR53,
         HUBERT_BASE,
         HUBERT_LARGE,
         HUBERT_XLARGE,
@@ -25,6 +42,15 @@ def test_pretraining_models(bundle):
 @pytest.mark.parametrize(
     "bundle,expected",
     [
+        (WAV2VEC2_ASR_BASE_10M, 'I|HAD|THAT|CURIYOSSITY|BESID|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_BASE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_BASE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_10M, 'I|HAD|THAT|CURIOUSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_LV60K_10M, 'I|HAD|THAT|CURIOUSSITY|BESID|ME|AT|THISS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_LV60K_100H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_LV60K_960H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
         (HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
         (HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
     ]
diff --git a/torchaudio/models/__init__.py b/torchaudio/models/__init__.py
index 1dea2683e8..9f3e8605a7 100644
--- a/torchaudio/models/__init__.py
+++ b/torchaudio/models/__init__.py
@@ -19,6 +19,19 @@
 )
 from .wav2vec2.pretrained import (
     Wav2Vec2PretrainedModelBundle,
+    WAV2VEC2_BASE,
+    WAV2VEC2_LARGE,
+    WAV2VEC2_LARGE_LV60K,
+    WAV2VEC2_ASR_BASE_10M,
+    WAV2VEC2_ASR_BASE_100H,
+    WAV2VEC2_ASR_BASE_960H,
+    WAV2VEC2_ASR_LARGE_10M,
+    WAV2VEC2_ASR_LARGE_100H,
+    WAV2VEC2_ASR_LARGE_960H,
+    WAV2VEC2_ASR_LARGE_LV60K_10M,
+    WAV2VEC2_ASR_LARGE_LV60K_100H,
+    WAV2VEC2_ASR_LARGE_LV60K_960H,
+    WAV2VEC2_XLSR53,
     HUBERT_BASE,
     HUBERT_LARGE,
     HUBERT_XLARGE,
@@ -45,6 +58,19 @@
     'hubert_ft_large',
     'hubert_ft_xlarge',
     'Wav2Vec2PretrainedModelBundle',
+    'WAV2VEC2_BASE',
+    'WAV2VEC2_LARGE',
+    'WAV2VEC2_LARGE_LV60K',
+    'WAV2VEC2_ASR_BASE_10M',
+    'WAV2VEC2_ASR_BASE_100H',
+    'WAV2VEC2_ASR_BASE_960H',
+    'WAV2VEC2_ASR_LARGE_10M',
+    'WAV2VEC2_ASR_LARGE_100H',
+    'WAV2VEC2_ASR_LARGE_960H',
+    'WAV2VEC2_ASR_LARGE_LV60K_10M',
+    'WAV2VEC2_ASR_LARGE_LV60K_100H',
+    'WAV2VEC2_ASR_LARGE_LV60K_960H',
+    'WAV2VEC2_XLSR53',
     'HUBERT_BASE',
     'HUBERT_LARGE',
     'HUBERT_XLARGE',
diff --git a/torchaudio/models/wav2vec2/pretrained.py b/torchaudio/models/wav2vec2/pretrained.py
index 0fbbed20e2..8b6d44c012 100644
--- a/torchaudio/models/wav2vec2/pretrained.py
+++ b/torchaudio/models/wav2vec2/pretrained.py
@@ -123,6 +123,549 @@ def _get_labels():
 )
 
 
+WAV2VEC2_BASE = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_BASE.__doc__ = """wav2vec 2.0 model with "Base" configuration.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+Not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
+[`Source `__]
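+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor and that ``speech.wav`` is a placeholder 16 kHz recording)::
+
+    >>> import torchaudio
+    >>> from torchaudio.models import WAV2VEC2_BASE
+    >>> model = WAV2VEC2_BASE.get_model()
+    >>> waveform, sample_rate = torchaudio.load('speech.wav')
+    >>> # Extract acoustic features for downstream tasks.
+    >>> features, _ = model.extract_features(waveform)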
+"""
+
+WAV2VEC2_ASR_BASE_10M = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960_asr_ll10m.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_BASE_10M.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
+[:footcite:`librilight`] ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_BASE_100H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960_asr_ls100.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_BASE_100H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 100 hours of transcribed audio from the "train-clean-100" subset.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_BASE_960H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960_asr_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_BASE_960H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on the same audio with the corresponding transcripts.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
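+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor and that index 0 of :py:attr:`labels` is the CTC blank
+token)::
+
+    >>> import torch
+    >>> import torchaudio
+    >>> from torchaudio.models import WAV2VEC2_ASR_BASE_960H
+    >>> model = WAV2VEC2_ASR_BASE_960H.get_model()
+    >>> waveform, sample_rate = torchaudio.load('speech.wav')
+    >>> emissions, _ = model(waveform)
+    >>> # Greedy CTC decoding: best label per frame, merge repeats, drop blanks.
+    >>> indices = torch.unique_consecutive(torch.argmax(emissions[0], dim=-1))
+    >>> labels = WAV2VEC2_ASR_BASE_960H.labels
+    >>> transcript = ''.join(labels[i] for i in indices.tolist() if i != 0)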
+"""
+
+WAV2VEC2_LARGE = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_LARGE.__doc__ = """wav2vec 2.0 model with "Large" configuration.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+Not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_10M = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960_asr_ll10m.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_10M.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
+[:footcite:`librilight`] ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_100H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960_asr_ls100.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_100H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 100 hours of transcribed audio from
+the same dataset ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
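+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor; the underlying model's forward pass also accepts an
+optional ``lengths`` Tensor for zero-padded batches)::
+
+    >>> import torch
+    >>> from torchaudio.models import WAV2VEC2_ASR_LARGE_100H
+    >>> model = WAV2VEC2_ASR_LARGE_100H.get_model()
+    >>> # Two utterances zero-padded to a common length; ``lengths`` records
+    >>> # how many samples of each row are valid.
+    >>> batch = torch.zeros(2, 32000)
+    >>> batch[0] = torch.randn(32000)
+    >>> batch[1, :16000] = torch.randn(16000)
+    >>> lengths = torch.tensor([32000, 16000])
+    >>> emissions, emission_lengths = model(batch, lengths)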
+"""
+
+WAV2VEC2_ASR_LARGE_960H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960_asr_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_960H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on the same audio with the corresponding transcripts.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_LARGE_LV60K = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_LARGE_LV60K.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`].
+Not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k_asr_ll10m.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration and an extra linear module.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 10 minutes of transcribed audio from
+the same dataset ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
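+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor, and that the model expects 16 kHz input, matching the
+LibriSpeech / Libri-Light training data)::
+
+    >>> import torchaudio
+    >>> from torchaudio.models import WAV2VEC2_ASR_LARGE_LV60K_10M
+    >>> model = WAV2VEC2_ASR_LARGE_LV60K_10M.get_model()
+    >>> waveform, sample_rate = torchaudio.load('speech.wav')
+    >>> if sample_rate != 16000:
+    ...     # Resample to the rate the checkpoint was trained on.
+    ...     waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+    >>> emissions, _ = model(waveform)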
+"""
+
+WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k_asr_ls100.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration and an extra linear module.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 100 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`] ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k_asr_ls960.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration and an extra linear module.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_XLSR53 = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_xlsr53.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_XLSR53.__doc__ = """wav2vec 2.0 model with "Large" configuration.
+
+Trained on 56,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* [:footcite:`Pratap_2020`],
+*CommonVoice* [:footcite:`ardila2020common`] and
+*BABEL* [:footcite:`Gales2014SpeechRA`]).
+Not fine-tuned.
+
+Originally published by the authors of
+*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
+[:footcite:`conneau2020unsupervised`].
+[`Source `__]
+"""
+
 HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
     'hubert_fairseq_base_ls960.pth',
     {