diff --git a/docs/source/models.rst b/docs/source/models.rst
index b266b6548c..50b285841f 100644
--- a/docs/source/models.rst
+++ b/docs/source/models.rst
@@ -139,6 +139,45 @@ Pre-trained Models
 
    .. autoproperty:: labels
 
+.. autodata:: WAV2VEC2_BASE
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_BASE_10M
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_BASE_100H
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_BASE_960H
+   :no-value:
+
+.. autodata:: WAV2VEC2_LARGE
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_10M
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_100H
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_960H
+   :no-value:
+
+.. autodata:: WAV2VEC2_LARGE_LV60K
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_10M
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_100H
+   :no-value:
+
+.. autodata:: WAV2VEC2_ASR_LARGE_LV60K_960H
+   :no-value:
+
+.. autodata:: WAV2VEC2_XLSR53
+   :no-value:
+
 .. autodata:: HUBERT_BASE
    :no-value:
diff --git a/test/integration_tests/wav2vec2_model_test.py b/test/integration_tests/wav2vec2_model_test.py
index d48d449baa..ca7c45a7aa 100644
--- a/test/integration_tests/wav2vec2_model_test.py
+++ b/test/integration_tests/wav2vec2_model_test.py
@@ -1,5 +1,18 @@
 import torchaudio
 from torchaudio.models import (
+    WAV2VEC2_BASE,
+    WAV2VEC2_LARGE,
+    WAV2VEC2_LARGE_LV60K,
+    WAV2VEC2_ASR_BASE_10M,
+    WAV2VEC2_ASR_BASE_100H,
+    WAV2VEC2_ASR_BASE_960H,
+    WAV2VEC2_ASR_LARGE_10M,
+    WAV2VEC2_ASR_LARGE_100H,
+    WAV2VEC2_ASR_LARGE_960H,
+    WAV2VEC2_ASR_LARGE_LV60K_10M,
+    WAV2VEC2_ASR_LARGE_LV60K_100H,
+    WAV2VEC2_ASR_LARGE_LV60K_960H,
+    WAV2VEC2_XLSR53,
     HUBERT_BASE,
     HUBERT_LARGE,
     HUBERT_XLARGE,
@@ -12,6 +25,10 @@
 @pytest.mark.parametrize(
     "bundle",
     [
+        WAV2VEC2_BASE,
+        WAV2VEC2_LARGE,
+        WAV2VEC2_LARGE_LV60K,
+        WAV2VEC2_XLSR53,
         HUBERT_BASE,
         HUBERT_LARGE,
         HUBERT_XLARGE,
@@ -25,6 +42,15 @@ def test_pretraining_models(bundle):
 @pytest.mark.parametrize(
     "bundle,expected",
     [
+        (WAV2VEC2_ASR_BASE_10M, 'I|HAD|THAT|CURIYOSSITY|BESID|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_BASE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_BASE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_10M, 'I|HAD|THAT|CURIOUSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_100H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_960H, 'I|HAD|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_LV60K_10M, 'I|HAD|THAT|CURIOUSSITY|BESID|ME|AT|THISS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_LV60K_100H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
+        (WAV2VEC2_ASR_LARGE_LV60K_960H, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
         (HUBERT_ASR_LARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|'),
         (HUBERT_ASR_XLARGE, 'I|HAVE|THAT|CURIOSITY|BESIDE|ME|AT|THIS|MOMENT|')
     ]
diff --git a/torchaudio/models/__init__.py b/torchaudio/models/__init__.py
index 1dea2683e8..9f3e8605a7 100644
--- a/torchaudio/models/__init__.py
+++ b/torchaudio/models/__init__.py
@@ -19,6 +19,19 @@
 )
 from .wav2vec2.pretrained import (
     Wav2Vec2PretrainedModelBundle,
+    WAV2VEC2_BASE,
+    WAV2VEC2_LARGE,
+    WAV2VEC2_LARGE_LV60K,
+    WAV2VEC2_ASR_BASE_10M,
+    WAV2VEC2_ASR_BASE_100H,
+    WAV2VEC2_ASR_BASE_960H,
+    WAV2VEC2_ASR_LARGE_10M,
+    WAV2VEC2_ASR_LARGE_100H,
+    WAV2VEC2_ASR_LARGE_960H,
+    WAV2VEC2_ASR_LARGE_LV60K_10M,
+    WAV2VEC2_ASR_LARGE_LV60K_100H,
+    WAV2VEC2_ASR_LARGE_LV60K_960H,
+    WAV2VEC2_XLSR53,
     HUBERT_BASE,
     HUBERT_LARGE,
     HUBERT_XLARGE,
@@ -45,6 +58,19 @@
     'hubert_ft_large',
     'hubert_ft_xlarge',
     'Wav2Vec2PretrainedModelBundle',
+    'WAV2VEC2_BASE',
+    'WAV2VEC2_LARGE',
+    'WAV2VEC2_LARGE_LV60K',
+    'WAV2VEC2_ASR_BASE_10M',
+    'WAV2VEC2_ASR_BASE_100H',
+    'WAV2VEC2_ASR_BASE_960H',
+    'WAV2VEC2_ASR_LARGE_10M',
+    'WAV2VEC2_ASR_LARGE_100H',
+    'WAV2VEC2_ASR_LARGE_960H',
+    'WAV2VEC2_ASR_LARGE_LV60K_10M',
+    'WAV2VEC2_ASR_LARGE_LV60K_100H',
+    'WAV2VEC2_ASR_LARGE_LV60K_960H',
+    'WAV2VEC2_XLSR53',
     'HUBERT_BASE',
     'HUBERT_LARGE',
     'HUBERT_XLARGE',
diff --git a/torchaudio/models/wav2vec2/pretrained.py b/torchaudio/models/wav2vec2/pretrained.py
index 0fbbed20e2..8b6d44c012 100644
--- a/torchaudio/models/wav2vec2/pretrained.py
+++ b/torchaudio/models/wav2vec2/pretrained.py
@@ -123,6 +123,549 @@ def _get_labels():
 )
 
 
+WAV2VEC2_BASE = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_BASE.__doc__ = """wav2vec 2.0 model with "Base" configuration.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+Not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0* [:footcite:`baevski2020wav2vec`].
+[`Source `__]
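+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor and that ``speech.wav`` is a placeholder 16 kHz recording)::
+
+    >>> import torchaudio
+    >>> from torchaudio.models import WAV2VEC2_BASE
+    >>> model = WAV2VEC2_BASE.get_model()
+    >>> waveform, sample_rate = torchaudio.load('speech.wav')
+    >>> # Extract acoustic features for downstream tasks.
+    >>> features, _ = model.extract_features(waveform)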
+"""
+
+WAV2VEC2_ASR_BASE_10M = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960_asr_ll10m.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_BASE_10M.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
+[:footcite:`librilight`] ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_BASE_100H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960_asr_ls100.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_BASE_100H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 100 hours of transcribed audio from the "train-clean-100" subset.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_BASE_960H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_base_ls960_asr_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 768,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 12,
+        "encoder_num_heads": 12,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 3072,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.1,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.05,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_BASE_960H.__doc__ = """wav2vec 2.0 model with "Base" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on the same audio with the corresponding transcripts.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
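+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor and that index 0 of :py:attr:`labels` is the CTC blank
+token)::
+
+    >>> import torch
+    >>> import torchaudio
+    >>> from torchaudio.models import WAV2VEC2_ASR_BASE_960H
+    >>> model = WAV2VEC2_ASR_BASE_960H.get_model()
+    >>> waveform, sample_rate = torchaudio.load('speech.wav')
+    >>> emissions, _ = model(waveform)
+    >>> # Greedy CTC decoding: best label per frame, merge repeats, drop blanks.
+    >>> indices = torch.unique_consecutive(torch.argmax(emissions[0], dim=-1))
+    >>> labels = WAV2VEC2_ASR_BASE_960H.labels
+    >>> transcript = ''.join(labels[i] for i in indices.tolist() if i != 0)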
+"""
+
+WAV2VEC2_LARGE = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_LARGE.__doc__ = """wav2vec 2.0 model with "Large" configuration.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+Not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_10M = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960_asr_ll10m.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_10M.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
+[:footcite:`librilight`] ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_100H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960_asr_ls100.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_100H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on 100 hours of transcribed audio from
+the same dataset ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
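+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor; the underlying model's forward pass also accepts an
+optional ``lengths`` Tensor for zero-padded batches)::
+
+    >>> import torch
+    >>> from torchaudio.models import WAV2VEC2_ASR_LARGE_100H
+    >>> model = WAV2VEC2_ASR_LARGE_100H.get_model()
+    >>> # Two utterances zero-padded to a common length; ``lengths`` records
+    >>> # how many samples of each row are valid.
+    >>> batch = torch.zeros(2, 32000)
+    >>> batch[0] = torch.randn(32000)
+    >>> batch[1, :16000] = torch.randn(16000)
+    >>> lengths = torch.tensor([32000, 16000])
+    >>> emissions, emission_lengths = model(batch, lengths)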
+"""
+
+WAV2VEC2_ASR_LARGE_960H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_ls960_asr_ls960.pth',
+    {
+        "extractor_mode": "group_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": False,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": False,
+        "encoder_layer_drop": 0.2,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_960H.__doc__ = """wav2vec 2.0 model with "Large" configuration and an extra linear module.
+
+Pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
+fine-tuned for ASR on the same audio with the corresponding transcripts.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_LARGE_LV60K = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_LARGE_LV60K.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`].
+Not fine-tuned.
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k_asr_ll10m.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration and an extra linear module.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 10 minutes of transcribed audio from
+the same dataset ("train-10min" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
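+
+Example (a usage sketch, not part of this diff: it assumes the bundle exposes a
+``get_model()`` accessor, and that the model expects 16 kHz input, matching the
+LibriSpeech / Libri-Light training data)::
+
+    >>> import torchaudio
+    >>> from torchaudio.models import WAV2VEC2_ASR_LARGE_LV60K_10M
+    >>> model = WAV2VEC2_ASR_LARGE_LV60K_10M.get_model()
+    >>> waveform, sample_rate = torchaudio.load('speech.wav')
+    >>> if sample_rate != 16000:
+    ...     # Resample to the rate the checkpoint was trained on.
+    ...     waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
+    >>> emissions, _ = model(waveform)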
+"""
+
+WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k_asr_ls100.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration and an extra linear module.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 100 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`] ("train-clean-100" subset).
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_lv60k_asr_ls960.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.1,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.1,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": 32,
+    },
+    _labels=_get_labels(),
+)
+WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """wav2vec 2.0 model with "Large-LV60k" configuration and an extra linear module.
+
+Pre-trained on 60,000 hours of unlabeled audio from
+*Libri-Light* dataset [:footcite:`librilight`], and
+fine-tuned for ASR on 960 hours of transcribed audio from
+*LibriSpeech* dataset [:footcite:`7178964`]
+(the combination of "train-clean-100", "train-clean-360", and "train-other-500").
+
+Originally published by the authors of *wav2vec 2.0*
+[:footcite:`baevski2020wav2vec`].
+[`Source `__]
+"""
+
+WAV2VEC2_XLSR53 = Wav2Vec2PretrainedModelBundle(
+    'wav2vec2_fairseq_large_xlsr53.pth',
+    {
+        "extractor_mode": "layer_norm",
+        "extractor_conv_layer_config": [
+            (512, 10, 5),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 3, 2),
+            (512, 2, 2),
+            (512, 2, 2),
+        ],
+        "extractor_conv_bias": True,
+        "encoder_embed_dim": 1024,
+        "encoder_projection_dropout": 0.0,
+        "encoder_pos_conv_kernel": 128,
+        "encoder_pos_conv_groups": 16,
+        "encoder_num_layers": 24,
+        "encoder_num_heads": 16,
+        "encoder_attention_dropout": 0.0,
+        "encoder_ff_interm_features": 4096,
+        "encoder_ff_interm_dropout": 0.0,
+        "encoder_dropout": 0.0,
+        "encoder_layer_norm_first": True,
+        "encoder_layer_drop": 0.0,
+        "aux_num_out": None,
+    },
+    _labels=None,
+)
+WAV2VEC2_XLSR53.__doc__ = """wav2vec 2.0 model with "Large" configuration.
+
+Trained on 56,000 hours of unlabeled audio from multiple datasets (
+*Multilingual LibriSpeech* [:footcite:`Pratap_2020`],
+*CommonVoice* [:footcite:`ardila2020common`] and
+*BABEL* [:footcite:`Gales2014SpeechRA`]).
+Not fine-tuned.
+
+Originally published by the authors of
+*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
+[:footcite:`conneau2020unsupervised`].
+[`Source `__]
+"""
+
 HUBERT_BASE = Wav2Vec2PretrainedModelBundle(
     'hubert_fairseq_base_ls960.pth',
     {