From 418e6bbf98eaa2dcdab832068ee27af778fc2376 Mon Sep 17 00:00:00 2001 From: NielsRogge Date: Fri, 20 May 2022 14:43:35 +0000 Subject: [PATCH 01/42] First draft --- docs/source/en/index.mdx | 1 + docs/source/en/model_doc/videomae.mdx | 47 ++ src/transformers/__init__.py | 25 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/feature_extraction_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + src/transformers/models/videomae/__init__.py | 65 +++ .../models/videomae/configuration_videomae.py | 142 +++++ .../videomae/convert_videomae_to_pytorch.py | 178 ++++++ .../models/videomae/modeling_videomae.py | 543 ++++++++++++++++++ src/transformers/models/videomae/test.py | 10 + src/transformers/utils/dummy_pt_objects.py | 31 + tests/models/videomae/__init__.py | 0 .../models/videomae/test_modeling_videomae.py | 443 ++++++++++++++ 15 files changed, 1492 insertions(+) create mode 100644 docs/source/en/model_doc/videomae.mdx create mode 100644 src/transformers/models/videomae/__init__.py create mode 100644 src/transformers/models/videomae/configuration_videomae.py create mode 100644 src/transformers/models/videomae/convert_videomae_to_pytorch.py create mode 100644 src/transformers/models/videomae/modeling_videomae.py create mode 100644 src/transformers/models/videomae/test.py create mode 100644 tests/models/videomae/__init__.py create mode 100644 tests/models/videomae/test_modeling_videomae.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index e8c3ed2928a7..c7edfbb242e0 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -299,6 +299,7 @@ Flax), PyTorch, and/or TensorFlow. | UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | | UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | | VAN | ❌ | ❌ | ✅ | ❌ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ✅ | ❌ | | ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | | Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | | VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx new file mode 100644 index 000000000000..6bfec2a8e936 --- /dev/null +++ b/docs/source/en/model_doc/videomae.mdx @@ -0,0 +1,47 @@ + + +# VideoMAE + +## Overview + +The VideoMAE model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). 
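The overview above is still a placeholder in this first draft, so a minimal usage sketch may help orient readers; it mirrors the throwaway `test.py` script added later in this patch and assumes the default configuration (16 frames, 16x16 patches, tubelet size 2) with randomly initialized weights rather than any particular pretrained checkpoint:

```python
import torch
from transformers import VideoMAEConfig, VideoMAEModel

# Randomly initialized model from the default configuration; no checkpoint is assumed.
config = VideoMAEConfig()
model = VideoMAEModel(config)
model.eval()

# VideoMAE consumes videos as 5D tensors:
# (batch_size, num_channels, num_frames, height, width).
pixel_values = torch.randn(1, 3, 16, 224, 224)

with torch.no_grad():
    outputs = model(pixel_values)

# 16x16 spatial patches and a tubelet size of 2 give
# (224 / 16) * (224 / 16) * (16 / 2) = 1568 tokens of dimension hidden_size.
print(outputs.last_hidden_state.shape)  # torch.Size([1, 1568, 768])
```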
+ + +## VideoMAEConfig + +[[autodoc]] VideoMAEConfig + + +## VideoMAEModel + +[[autodoc]] VideoMAEModel + - forward + + +## VideoMAEForPreTraining + +[[autodoc]] transformers.VideoMAEForPreTraining + - forward + diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 75784ce46376..d8cd32962ff1 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -341,6 +341,7 @@ "UniSpeechSatConfig", ], "models.van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"], + "models.videomae": ["VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VideoMAEConfig"], "models.vilt": ["VILT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViltConfig", "ViltFeatureExtractor", "ViltProcessor"], "models.vision_encoder_decoder": ["VisionEncoderDecoderConfig"], "models.vision_text_dual_encoder": ["VisionTextDualEncoderConfig", "VisionTextDualEncoderProcessor"], @@ -1871,6 +1872,15 @@ "ViTMAEPreTrainedModel", ] ) + _import_structure["models.videomae"].extend( + [ + "VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST", + "VideoMAEForPreTraining", + "VideoMAELayer", + "VideoMAEModel", + "VideoMAEPreTrainedModel", + ] + ) _import_structure["models.wav2vec2"].extend( [ "WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2535,6 +2545,13 @@ "TFViTMAEPreTrainedModel", ] ) + _import_structure["models.videomae"].extend( + [ + "TFVideoMAEForPreTraining", + "TFVideoMAEModel", + "TFVideoMAEPreTrainedModel", + ] + ) _import_structure["models.wav2vec2"].extend( [ "TF_WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3100,6 +3117,7 @@ from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig from .models.van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig + from .models.videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig from .models.vilt import VILT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViltConfig, ViltFeatureExtractor, ViltProcessor from .models.vision_encoder_decoder import VisionEncoderDecoderConfig from .models.vision_text_dual_encoder import VisionTextDualEncoderConfig, VisionTextDualEncoderProcessor @@ -4338,6 +4356,13 @@ VanModel, VanPreTrainedModel, ) + from .models.videomae import ( + VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST, + VideoMAEForPreTraining, + VideoMAELayer, + VideoMAEModel, + VideoMAEPreTrainedModel, + ) from .models.vilt import ( VILT_PRETRAINED_MODEL_ARCHIVE_LIST, ViltForImageAndTextRetrieval, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 1b81ce7d8fab..11887db91f83 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -137,6 +137,7 @@ unispeech, unispeech_sat, van, + videomae, vilt, vision_encoder_decoder, vision_text_dual_encoder, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 13f69c024a9a..d8ecbb49e64f 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -132,6 +132,7 @@ ("unispeech", "UniSpeechConfig"), ("unispeech-sat", "UniSpeechSatConfig"), ("van", "VanConfig"), + ("videomae", "VideoMAEConfig"), ("vilt", "ViltConfig"), ("vision-encoder-decoder", "VisionEncoderDecoderConfig"), ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), @@ -247,6 +248,7 @@ ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("unispeech-sat", "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"), + 
("videomae", "VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -388,6 +390,7 @@ ("unispeech", "UniSpeech"), ("unispeech-sat", "UniSpeechSat"), ("van", "VAN"), + ("videomae", "VideoMAE"), ("vilt", "ViLT"), ("vision-encoder-decoder", "Vision Encoder decoder"), ("vision-text-dual-encoder", "VisionTextDualEncoder"), diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index 8c4564e261c6..f77633e56c60 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -69,6 +69,7 @@ ("swinv2", "ViTFeatureExtractor"), ("van", "ConvNextFeatureExtractor"), ("vilt", "ViltFeatureExtractor"), + ("videomae", "ViTFeatureExtractor"), ("vit", "ViTFeatureExtractor"), ("vit_mae", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a86e8bc56da3..4de2ee393488 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -127,6 +127,7 @@ ("unispeech", "UniSpeechModel"), ("unispeech-sat", "UniSpeechSatModel"), ("van", "VanModel"), + ("videomae", "VideoMAEModel"), ("vilt", "ViltModel"), ("vision-text-dual-encoder", "VisionTextDualEncoderModel"), ("visual_bert", "VisualBertModel"), @@ -187,6 +188,7 @@ ("transfo-xl", "TransfoXLLMHeadModel"), ("unispeech", "UniSpeechForPreTraining"), ("unispeech-sat", "UniSpeechSatForPreTraining"), + ("videomae", "VideoMAEForPreTraining"), ("visual_bert", "VisualBertForPreTraining"), ("vit_mae", "ViTMAEForPreTraining"), ("wav2vec2", "Wav2Vec2ForPreTraining"), diff --git a/src/transformers/models/videomae/__init__.py b/src/transformers/models/videomae/__init__.py new file mode 100644 index 000000000000..f9ca1898af35 --- /dev/null +++ b/src/transformers/models/videomae/__init__.py @@ -0,0 +1,65 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
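The auto-class entries registered above are what make the new model type discoverable by name. A small sketch of the intended lookup path, assuming the branch in this patch is installed (the classes only exist once the files added below are in place):

```python
from transformers import AutoConfig, AutoModel, AutoModelForPreTraining

# "videomae" resolves through the CONFIG_MAPPING entry added in configuration_auto.py.
config = AutoConfig.for_model("videomae")                   # -> VideoMAEConfig

# The modeling_auto.py entries route the same model type to the new classes.
backbone = AutoModel.from_config(config)                    # -> VideoMAEModel
pretraining = AutoModelForPreTraining.from_config(config)   # -> VideoMAEForPreTraining
```

Note that the feature extraction mapping in this draft reuses `ViTFeatureExtractor` for the `videomae` key rather than introducing a dedicated video feature extractor.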
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_videomae": ["VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VideoMAEConfig"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_videomae"] = [ + "VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST", + "VideoMAEForPreTraining", + "VideoMAELayer", + "VideoMAEModel", + "VideoMAEPreTrainedModel", + ] + +if TYPE_CHECKING: + from .configuration_videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_videomae import ( + VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST, + VideoMAEForPreTraining, + VideoMAELayer, + VideoMAEModel, + VideoMAEPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py new file mode 100644 index 000000000000..6c2951547359 --- /dev/null +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -0,0 +1,142 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ViT MAE model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "nanjing/videomae-base": "https://huggingface.co/nanjing/videomae-base/resolve/main/config.json", +} + + + +class VideoMAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate an ViT + MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with + the defaults will yield a similar configuration to that of the ViT + [nanjing/videomae-base](https://huggingface.co/nanjing/videomae-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + decoder_num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the decoder. + decoder_hidden_size (`int`, *optional*, defaults to 512): + Dimensionality of the decoder. + decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + Number of hidden layers in the decoder. + decoder_intermediate_size (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + mask_ratio (`float`, *optional*, defaults to 0.75): + The ratio of the number of masked tokens in the input sequence. + norm_pix_loss (`bool`, *optional*, defaults to `False`): + Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved + representation quality in the experiments of the authors. 
+ + Example: + + ```python + >>> from transformers import VideoMAEModel, VideoMAEConfig + + >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> configuration = VideoMAEConfig() + + >>> # Initializing a model from the vit-mae-base style configuration + >>> model = VideoMAEModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "videomae" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + is_encoder_decoder=False, + image_size=224, + patch_size=16, + num_channels=3, + qkv_bias=True, + decoder_num_attention_heads=16, + decoder_hidden_size=512, + decoder_num_hidden_layers=8, + decoder_intermediate_size=2048, + mask_ratio=0.75, + norm_pix_loss=False, + **kwargs + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.qkv_bias = qkv_bias + self.decoder_num_attention_heads = decoder_num_attention_heads + self.decoder_hidden_size = decoder_hidden_size + self.decoder_num_hidden_layers = decoder_num_hidden_layers + self.decoder_intermediate_size = decoder_intermediate_size + self.mask_ratio = mask_ratio + self.norm_pix_loss = norm_pix_loss diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py new file mode 100644 index 000000000000..e4e1e0882bc4 --- /dev/null +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
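The conversion script that follows overrides a handful of the configuration fields above for the larger checkpoints. A short sketch of that pattern, using the same values the script applies for its "large" variant (treat the exact numbers as belonging to this draft rather than to any final release):

```python
from transformers import VideoMAEConfig, VideoMAEForPreTraining

# Defaults correspond to a base-sized encoder (hidden_size=768, 12 layers, 12 heads).
config = VideoMAEConfig()

# "large" settings, as applied by convert_videomae_to_pytorch.py below.
config.hidden_size = 1024
config.intermediate_size = 4096
config.num_hidden_layers = 24
config.num_attention_heads = 16

model = VideoMAEForPreTraining(config)
```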
+"""Convert ViT MAE checkpoints from the original repository: https://github.com/facebookresearch/mae""" + +import argparse + +import torch +from PIL import Image + +import requests +from transformers import VideoMAEConfig, VideoMAEFeatureExtractor, VideoMAEForPreTraining + + +def rename_key(name): + if "cls_token" in name: + name = name.replace("cls_token", "vit.embeddings.cls_token") + if "mask_token" in name: + name = name.replace("mask_token", "decoder.mask_token") + if "decoder_pos_embed" in name: + name = name.replace("decoder_pos_embed", "decoder.decoder_pos_embed") + if "pos_embed" in name and "decoder" not in name: + name = name.replace("pos_embed", "vit.embeddings.position_embeddings") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "vit.embeddings.patch_embeddings.projection") + if "patch_embed.norm" in name: + name = name.replace("patch_embed.norm", "vit.embeddings.norm") + if "decoder_blocks" in name: + name = name.replace("decoder_blocks", "decoder.decoder_layers") + if "blocks" in name: + name = name.replace("blocks", "vit.encoder.layer") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + if "decoder_embed" in name: + name = name.replace("decoder_embed", "decoder.decoder_embed") + if "decoder_norm" in name: + name = name.replace("decoder_norm", "decoder.decoder_norm") + if "decoder_pred" in name: + name = name.replace("decoder_pred", "decoder.decoder_pred") + if "norm.weight" in name and "decoder" not in name: + name = name.replace("norm.weight", "vit.layernorm.weight") + if "norm.bias" in name and "decoder" not in name: + name = name.replace("norm.bias", "vit.layernorm.bias") + + return name + + +def convert_state_dict(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[1]) + if "decoder_blocks" in key: + dim = config.decoder_hidden_size + prefix = "decoder.decoder_layers." + if "weight" in key: + orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + elif "bias" in key: + orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] + else: + dim = config.hidden_size + prefix = "vit.encoder.layer." 
+ if "weight" in key: + orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] + elif "bias" in key: + orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] + + else: + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path): + config = VideoMAEConfig() + if "large" in checkpoint_url: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + elif "huge" in checkpoint_url: + config.patch_size = 14 + config.hidden_size = 1280 + config.intermediate_size = 5120 + config.num_hidden_layers = 32 + config.num_attention_heads = 16 + + model = VideoMAEForPreTraining(config) + + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] + + feature_extractor = VideoMAEFeatureExtractor(size=config.image_size) + + new_state_dict = convert_state_dict(state_dict, config) + + model.load_state_dict(new_state_dict) + model.eval() + + url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" + + image = Image.open(requests.get(url, stream=True).raw) + feature_extractor = VideoMAEFeatureExtractor(size=config.image_size) + inputs = feature_extractor(images=image, return_tensors="pt") + + # forward pass + torch.manual_seed(2) + outputs = model(**inputs) + logits = outputs.logits + + if "large" in checkpoint_url: + expected_slice = torch.tensor( + [[-0.7309, -0.7128, -1.0169], [-1.0161, -0.9058, -1.1878], [-1.0478, -0.9411, -1.1911]] + ) + elif "huge" in checkpoint_url: + expected_slice = torch.tensor( + [[-1.1599, -0.9199, -1.2221], [-1.1952, -0.9269, -1.2307], [-1.2143, -0.9337, -1.2262]] + ) + else: + expected_slice = torch.tensor( + [[-0.9192, -0.8481, -1.1259], [-1.1349, -1.0034, -1.2599], [-1.1757, -1.0429, -1.2726]] + ) + + # verify logits + assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) + + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--checkpoint_url", + default="https://dl.fbaipublicfiles.com/mae/visualize/mae_visualize_vit_base.pth", + type=str, + help="URL of the checkpoint you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + + args = parser.parse_args() + convert_videomae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py new file mode 100644 index 000000000000..9c64268df315 --- /dev/null +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -0,0 +1,543 @@ +# coding=utf-8 +# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch ViT MAE (masked autoencoder) model.""" + + +import collections.abc +import math +from copy import deepcopy +from dataclasses import dataclass +from typing import Optional, Set, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_videomae import VideoMAEConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "VideoMAEConfig" +_CHECKPOINT_FOR_DOC = "nanjing/videomae-base" + +VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "nanjing/videomae-base", + # See all VideoMAE models at https://huggingface.co/models?filter=videomae +] + + +# sin-cos position encoding +# https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 +def get_sinusoid_encoding_table(n_position, d_hid): + ''' Sinusoid position encoding table ''' + # TODO: make it with torch instead of numpy + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) + + +class VideoMAEEmbeddings(nn.Module): + """ + Construct the patch and position embeddings. + + """ + + def __init__(self, config): + super().__init__() + + self.patch_embeddings = PatchEmbeddings( + image_size=config.image_size, + patch_size=config.patch_size, + num_channels=config.num_channels, + embed_dim=config.hidden_size, + ) + num_patches = self.patch_embeddings.num_patches + # fixed sin-cos embedding + self.position_embeddings = get_sinusoid_encoding_table(num_patches, config.hidden_size) + self.config = config + + def forward(self, pixel_values): + # create patch embeddings + embeddings = self.patch_embeddings(pixel_values) + + # add position embeddings + embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach() + + return embeddings + + +class PatchEmbeddings(nn.Module): + """ + Video to Patch Embedding. 
+ + """ + + def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768, num_frames=16, tubelet_size=2): + super().__init__() + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + self.image_size = image_size + self.patch_size = patch_size + self.tubelet_size = int(tubelet_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size) + self.num_patches = num_patches + self.projection = nn.Conv3d(in_channels=num_channels, out_channels=embed_dim, + kernel_size = (self.tubelet_size, patch_size[0],patch_size[1]), + stride=(self.tubelet_size, patch_size[0], patch_size[1])) + + def forward(self, pixel_values): + batch_size, num_channels, time, height, width = pixel_values.shape + if height != self.image_size[0] or width != self.image_size[1]: + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." + ) + x = self.projection(pixel_values).flatten(2).transpose(1, 2) + print("Shape of embeddings:", x.shape) + return x + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention ViT->VideoMAE +class VideoMAESelfAttention(nn.Module): + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size,} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->VideoMAE +class VideoMAESelfOutput(nn.Module): + """ + The residual connection is defined in VideoMAELayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->VideoMAE +class VideoMAEAttention(nn.Module): + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + self.attention = VideoMAESelfAttention(config) + self.output = VideoMAESelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->VideoMAE +class VideoMAEIntermediate(nn.Module): + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput 
ViT->VideoMAE +class VideoMAEOutput(nn.Module): + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->VideoMAE +class VideoMAELayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = VideoMAEAttention(config) + self.intermediate = VideoMAEIntermediate(config) + self.output = VideoMAEOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in VideoMAE, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in VideoMAE, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->VideoMAE +class VideoMAEEncoder(nn.Module): + def __init__(self, config: VideoMAEConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([VideoMAELayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + layer_head_mask, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = 
all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +# Copied from transformers.models.vit_mae.modeling_vit_mae.ViTMAEPreTrainedModel with ViTMAE->VideoMAE +class VideoMAEPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = VideoMAEConfig + base_model_prefix = "vit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, VideoMAEEncoder): + module.gradient_checkpointing = value + + +VIDEOMAE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`VideoMAEConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +VIDEOMAE_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare VideoMAE Model transformer outputting raw hidden-states without any specific head on top.", + VIDEOMAE_START_DOCSTRING, +) +class VideoMAEModel(VideoMAEPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = VideoMAEEmbeddings(config) + self.encoder = VideoMAEEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values=None, + noise=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoFeatureExtractor, VideoMAEModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("nanjing/videomae-base") + >>> model = VideoMAEModel.from_pretrained("nanjing/videomae-base") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) \ No newline at end of file diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py new file mode 100644 index 000000000000..634937ef4152 --- /dev/null +++ b/src/transformers/models/videomae/test.py @@ -0,0 +1,10 @@ +import torch +from transformers import VideoMAEConfig, VideoMAEModel + +model = 
VideoMAEModel(VideoMAEConfig()) + +pixel_values = torch.randn(1,3,16,224,224) + +outputs = model(pixel_values) + +print(outputs.keys()) \ No newline at end of file diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c1dfc6b6b7ca..3012da64e253 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4753,6 +4753,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class VideoMAEForPreTraining(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class VideoMAELayer(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class VideoMAEModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class VideoMAEPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + VILT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/videomae/__init__.py b/tests/models/videomae/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py new file mode 100644 index 000000000000..24fe543ff655 --- /dev/null +++ b/tests/models/videomae/test_modeling_videomae.py @@ -0,0 +1,443 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch VideoMAE model. 
""" + + +import inspect +import math +import tempfile +import unittest + +import numpy as np + +from transformers import VideoMAEConfig +from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.utils import cached_property, is_torch_available, is_vision_available + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch + from torch import nn + + from transformers import VideoMAEForPreTraining, VideoMAEModel + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple + + +if is_vision_available(): + from PIL import Image + + from transformers import ViTFeatureExtractor + + +class VideoMAEModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=30, + patch_size=2, + num_channels=3, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_sequence_label_size=10, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + config = self.get_config() + + return config, pixel_values, labels + + def get_config(self): + return VideoMAEConfig( + image_size=self.image_size, + patch_size=self.patch_size, + num_channels=self.num_channels, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = VideoMAEModel(config=config) + model.to(torch_device) + model.eval() + result = model(pixel_values) + # expected sequence length = (num_patches + 1) * (1 - config.mask_ratio), rounded above + # (we add 1 for the [CLS] token) + image_size = to_2tuple(self.image_size) + patch_size = to_2tuple(self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + expected_seq_len = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, self.hidden_size)) + + def create_and_check_for_pretraining(self, config, pixel_values, labels): + model = VideoMAEForPreTraining(config) + model.to(torch_device) + 
model.eval() + result = model(pixel_values) + # expected sequence length = num_patches + image_size = to_2tuple(self.image_size) + patch_size = to_2tuple(self.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + expected_seq_len = num_patches + expected_num_channels = self.patch_size**2 * self.num_channels + self.parent.assertEqual(result.logits.shape, (self.batch_size, expected_seq_len, expected_num_channels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_torch +class VideoMAEModelTest(ModelTesterMixin, unittest.TestCase): + """ + Here we also overwrite some of the tests of test_modeling_common.py, as VideoMAE does not use input_ids, inputs_embeds, + attention_mask and seq_length. + """ + + all_model_classes = (VideoMAEModel, VideoMAEForPreTraining) if is_torch_available() else () + + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + test_head_masking = False + + def setUp(self): + self.model_tester = VideoMAEModelTester(self) + self.config_tester = ConfigTester(self, config_class=VideoMAEConfig, has_text_modality=False, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_inputs_embeds(self): + # VideoMAE does not use inputs_embeds + pass + + def test_model_common_attributes(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + self.assertIsInstance(model.get_input_embeddings(), (nn.Module)) + x = model.get_output_embeddings() + self.assertTrue(x is None or isinstance(x, nn.Linear)) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_pretraining(*config_and_inputs) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + # in VideoMAE, the seq_len equals (number of patches + 1) * (1 - mask_ratio), rounded above + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_len = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) + encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) + encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) + chunk_length = getattr(self.model_tester, "chunk_length", None) + if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): + encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes + + for model_class in 
self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + if chunk_length is not None: + self.assertListEqual( + list(attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if hasattr(self.model_tester, "num_hidden_states_types"): + added_hidden_states = self.model_tester.num_hidden_states_types + elif self.is_encoder_decoder: + added_hidden_states = 2 + else: + added_hidden_states = 1 + self.assertEqual(out_len + added_hidden_states, len(outputs)) + + self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + if chunk_length is not None: + self.assertListEqual( + list(self_attentions[0].shape[-4:]), + [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], + ) + else: + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + # VideoMAE has a different seq_length + image_size = to_2tuple(self.model_tester.image_size) + patch_size = to_2tuple(self.model_tester.patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + seq_length = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in 
self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # overwrite from common since VideoMAEForPretraining has random masking, we need to fix the noise + # to generate masks during test + def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): + + # make masks reproducible + np.random.seed(2) + + num_patches = int((pt_model.config.image_size // pt_model.config.patch_size) ** 2) + noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) + pt_noise = torch.from_numpy(noise) + + # Add `noise` argument. + # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument + pt_inputs_dict["noise"] = pt_noise + + super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) + + def test_save_load(self): + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + # make random mask reproducible + torch.manual_seed(2) + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + out_2 = outputs[0].cpu().numpy() + out_2[np.isnan(out_2)] = 0 + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname) + model.to(torch_device) + # make random mask reproducible + torch.manual_seed(2) + with torch.no_grad(): + after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + # Make sure we don't have nans + out_1 = after_outputs[0].cpu().numpy() + out_1[np.isnan(out_1)] = 0 + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + @unittest.skip( + reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) + def test_determinism(self): + pass + + @unittest.skip( + reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip( + reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load + to get deterministic results.""" + ) + def test_save_load_fast_init_to_base(self): + pass + + @unittest.skip(reason="""VideoMAE returns a random mask + ids_restore in each forward pass. 
See test_save_load""") + def test_model_outputs_equivalence(self): + pass + + @slow + def test_model_from_pretrained(self): + for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = VideoMAEModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_torch +@require_vision +class VideoMAEModelIntegrationTest(unittest.TestCase): + @cached_property + def default_feature_extractor(self): + return ViTFeatureExtractor.from_pretrained("nanjing/videomae-base") if is_vision_available() else None + + @slow + def test_inference_for_pretraining(self): + # make random mask reproducible across the PT and TF model + np.random.seed(2) + + model = VideoMAEForPreTraining.from_pretrained("nanjing/videomae-base").to(torch_device) + + feature_extractor = self.default_feature_extractor + image = prepare_img() + inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) + + # prepare a noise vector that will be also used for testing the TF model + # (this way we can ensure that the PT and TF models operate on the same inputs) + videomae_config = VideoMAEConfig() + num_patches = int((videomae_config.image_size // videomae_config.patch_size) ** 2) + noise = np.random.uniform(size=(1, num_patches)) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device)) + + # verify the logits + expected_shape = torch.Size((1, 196, 768)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor( + [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]] + ) + + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice.to(torch_device), atol=1e-4)) From e32d1a5ab0c3bd47f22c47fc3d0c81153732f42b Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 22 May 2022 11:52:56 +0200 Subject: [PATCH 02/42] Add VideoMAEForVideoClassification --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/videomae.mdx | 5 + src/transformers/__init__.py | 9 +- src/transformers/models/auto/modeling_auto.py | 16 ++ src/transformers/models/videomae/__init__.py | 8 +- .../models/videomae/configuration_videomae.py | 5 +- .../videomae/convert_videomae_to_pytorch.py | 108 +++++------ .../models/videomae/modeling_videomae.py | 183 ++++++++++++++---- src/transformers/models/videomae/test.py | 6 +- src/transformers/utils/dummy_pt_objects.py | 7 + .../models/videomae/test_modeling_videomae.py | 6 +- 11 files changed, 248 insertions(+), 107 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index c7edfbb242e0..d3d5f8d411b5 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -299,7 +299,7 @@ Flax), PyTorch, and/or TensorFlow. | UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ | | UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ | | VAN | ❌ | ❌ | ✅ | ❌ | ❌ | -| VideoMAE | ❌ | ❌ | ✅ | ✅ | ❌ | +| VideoMAE | ❌ | ❌ | ✅ | ❌ | ❌ | | ViLT | ❌ | ❌ | ✅ | ❌ | ❌ | | Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ | | VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ | diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx index 6bfec2a8e936..c36f40dd52f2 100644 --- a/docs/source/en/model_doc/videomae.mdx +++ b/docs/source/en/model_doc/videomae.mdx @@ -45,3 +45,8 @@ The original code can be found [here](). 
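Both the save/load test and the integration test above compare two forward passes of a model that samples a random mask internally, which only works because the RNG is re-seeded before each pass. A self-contained sketch of that pattern, using a stand-in function rather than the real VideoMAE classes:

```python
import numpy as np
import torch

# Stand-alone sketch of the re-seeding pattern the tests above depend on: a model that
# draws a random mask inside forward() only produces comparable outputs across two runs
# if the RNG is re-seeded before each call. `fake_forward` is a stand-in, not VideoMAE.
def fake_forward(batch_size=1, num_positions=196, mask_ratio=0.9):
    noise = np.random.uniform(size=(batch_size, num_positions))
    return torch.from_numpy(noise < mask_ratio)

np.random.seed(2)
mask_first_run = fake_forward()
np.random.seed(2)
mask_second_run = fake_forward()
assert torch.equal(mask_first_run, mask_second_run)
```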
[[autodoc]] transformers.VideoMAEForPreTraining - forward + +## VideoMAEForVideoClassification + +[[autodoc]] transformers.VideoMAEForVideoClassification + - forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d8cd32962ff1..68ed1b3ca3eb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1879,6 +1879,7 @@ "VideoMAELayer", "VideoMAEModel", "VideoMAEPreTrainedModel", + "VideoMAEForVideoClassification", ] ) _import_structure["models.wav2vec2"].extend( @@ -2545,13 +2546,6 @@ "TFViTMAEPreTrainedModel", ] ) - _import_structure["models.videomae"].extend( - [ - "TFVideoMAEForPreTraining", - "TFVideoMAEModel", - "TFVideoMAEPreTrainedModel", - ] - ) _import_structure["models.wav2vec2"].extend( [ "TF_WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4359,6 +4353,7 @@ from .models.videomae import ( VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST, VideoMAEForPreTraining, + VideoMAEForVideoClassification, VideoMAELayer, VideoMAEModel, VideoMAEPreTrainedModel, diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 4de2ee393488..bd4774c245b0 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -383,6 +383,12 @@ ] ) +MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("videomae", "VideoMAEForVideoClassification"), + ] +) + MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( [ ("vision-encoder-decoder", "VisionEncoderDecoderModel"), @@ -756,6 +762,9 @@ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES ) +MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES +) MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES @@ -940,6 +949,13 @@ class AutoModelForObjectDetection(_BaseAutoModelClass): AutoModelForObjectDetection = auto_class_update(AutoModelForObjectDetection, head_doc="object detection") +class AutoModelForVideoClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING + + +AutoModelForVideoClassification = auto_class_update(AutoModelForVideoClassification, head_doc="video classification") + + class AutoModelForVision2Seq(_BaseAutoModelClass): _model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING diff --git a/src/transformers/models/videomae/__init__.py b/src/transformers/models/videomae/__init__.py index f9ca1898af35..87ceabf002d6 100644 --- a/src/transformers/models/videomae/__init__.py +++ b/src/transformers/models/videomae/__init__.py @@ -17,11 +17,7 @@ # limitations under the License. 
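The `_import_structure` mapping in the `__init__.py` change below is what lets `from transformers import VideoMAEModel` resolve lazily, without importing torch-heavy modules up front. A stripped-down sketch of the same idea for a hypothetical package, using PEP 562 module `__getattr__` instead of the library's internal `_LazyModule` helper:

```python
# Sketch of a lazy package __init__: a submodule is imported only when one of its
# exported names is first accessed. Module and class names below are illustrative.
import importlib

_import_structure = {
    "configuration_videomae": ["VideoMAEConfig"],
    "modeling_videomae": ["VideoMAEModel", "VideoMAEForPreTraining", "VideoMAEForVideoClassification"],
}
_name_to_module = {name: module for module, names in _import_structure.items() for name in names}


def __getattr__(name):  # PEP 562: called for attributes not found in the module
    if name in _name_to_module:
        submodule = importlib.import_module("." + _name_to_module[name], __name__)
        return getattr(submodule, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```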
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available _import_structure = { @@ -40,6 +36,7 @@ "VideoMAELayer", "VideoMAEModel", "VideoMAEPreTrainedModel", + "VideoMAEForVideoClassification", ] if TYPE_CHECKING: @@ -54,6 +51,7 @@ from .modeling_videomae import ( VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST, VideoMAEForPreTraining, + VideoMAEForVideoClassification, VideoMAELayer, VideoMAEModel, VideoMAEPreTrainedModel, diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 6c2951547359..11929d2356e8 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -25,7 +25,6 @@ } - class VideoMAEConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate an ViT @@ -65,6 +64,8 @@ class VideoMAEConfig(PretrainedConfig): The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. + use_mean_pooling (`bool`, *optional*, defaults to `True`): + Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token. decoder_num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the decoder. decoder_hidden_size (`int`, *optional*, defaults to 512): @@ -111,6 +112,7 @@ def __init__( patch_size=16, num_channels=3, qkv_bias=True, + use_mean_pooling=True, decoder_num_attention_heads=16, decoder_hidden_size=512, decoder_num_hidden_layers=8, @@ -134,6 +136,7 @@ def __init__( self.patch_size = patch_size self.num_channels = num_channels self.qkv_bias = qkv_bias + self.use_mean_pooling = use_mean_pooling self.decoder_num_attention_heads = decoder_num_attention_heads self.decoder_hidden_size = decoder_hidden_size self.decoder_num_hidden_layers = decoder_num_hidden_layers diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index e4e1e0882bc4..2e9946c805bb 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -17,33 +17,33 @@ import argparse import torch -from PIL import Image -import requests -from transformers import VideoMAEConfig, VideoMAEFeatureExtractor, VideoMAEForPreTraining +from transformers import VideoMAEConfig, VideoMAEForVideoClassification def rename_key(name): if "cls_token" in name: - name = name.replace("cls_token", "vit.embeddings.cls_token") + name = name.replace("cls_token", "videomae.embeddings.cls_token") if "mask_token" in name: name = name.replace("mask_token", "decoder.mask_token") if "decoder_pos_embed" in name: name = name.replace("decoder_pos_embed", "decoder.decoder_pos_embed") if "pos_embed" in name and "decoder" not in name: - name = name.replace("pos_embed", "vit.embeddings.position_embeddings") + name = name.replace("pos_embed", "videomae.embeddings.position_embeddings") if "patch_embed.proj" in name: - name = name.replace("patch_embed.proj", "vit.embeddings.patch_embeddings.projection") + name = name.replace("patch_embed.proj", "videomae.embeddings.patch_embeddings.projection") if "patch_embed.norm" in name: - name = 
name.replace("patch_embed.norm", "vit.embeddings.norm") + name = name.replace("patch_embed.norm", "videomae.embeddings.norm") if "decoder_blocks" in name: name = name.replace("decoder_blocks", "decoder.decoder_layers") if "blocks" in name: - name = name.replace("blocks", "vit.encoder.layer") + name = name.replace("blocks", "videomae.encoder.layer") if "attn.proj" in name: name = name.replace("attn.proj", "attention.output.dense") - if "attn" in name: + if "attn" in name and "bias" not in name: name = name.replace("attn", "attention.self") + if "attn" in name: + name = name.replace("attn", "attention.attention") if "norm1" in name: name = name.replace("norm1", "layernorm_before") if "norm2" in name: @@ -58,10 +58,12 @@ def rename_key(name): name = name.replace("decoder_norm", "decoder.decoder_norm") if "decoder_pred" in name: name = name.replace("decoder_pred", "decoder.decoder_pred") - if "norm.weight" in name and "decoder" not in name: - name = name.replace("norm.weight", "vit.layernorm.weight") - if "norm.bias" in name and "decoder" not in name: - name = name.replace("norm.bias", "vit.layernorm.bias") + if "norm.weight" in name and "decoder" not in name and "fc" not in name: + name = name.replace("norm.weight", "videomae.layernorm.weight") + if "norm.bias" in name and "decoder" not in name and "fc" not in name: + name = name.replace("norm.bias", "videomae.layernorm.bias") + if "head" in name: + name = name.replace("head", "classifier") return name @@ -80,21 +82,22 @@ def convert_state_dict(orig_state_dict, config): orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] - elif "bias" in key: - orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] - orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] - orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] + # elif "bias" in key: + # orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] + # orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + # orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] else: dim = config.hidden_size - prefix = "vit.encoder.layer." + prefix = "videomae.encoder.layer." 
if "weight" in key: orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] - elif "bias" in key: - orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] - orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] - orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] + # elif "bias" in key: + # print("hello we're here") + # orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] + # orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] + # orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] else: orig_state_dict[rename_key(key)] = val @@ -116,49 +119,42 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path): config.num_hidden_layers = 32 config.num_attention_heads = 16 - model = VideoMAEForPreTraining(config) + config.num_labels = 400 - state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"] - - feature_extractor = VideoMAEFeatureExtractor(size=config.image_size) + model = VideoMAEForVideoClassification(config) + state_dict = torch.load(checkpoint_url, map_location="cpu")["module"] new_state_dict = convert_state_dict(state_dict, config) model.load_state_dict(new_state_dict) model.eval() - url = "https://user-images.githubusercontent.com/11435359/147738734-196fd92f-9260-48d5-ba7e-bf103d29364d.jpg" - - image = Image.open(requests.get(url, stream=True).raw) - feature_extractor = VideoMAEFeatureExtractor(size=config.image_size) - inputs = feature_extractor(images=image, return_tensors="pt") + # # forward pass + # torch.manual_seed(2) + # outputs = model(**inputs) + # logits = outputs.logits - # forward pass - torch.manual_seed(2) - outputs = model(**inputs) - logits = outputs.logits - - if "large" in checkpoint_url: - expected_slice = torch.tensor( - [[-0.7309, -0.7128, -1.0169], [-1.0161, -0.9058, -1.1878], [-1.0478, -0.9411, -1.1911]] - ) - elif "huge" in checkpoint_url: - expected_slice = torch.tensor( - [[-1.1599, -0.9199, -1.2221], [-1.1952, -0.9269, -1.2307], [-1.2143, -0.9337, -1.2262]] - ) - else: - expected_slice = torch.tensor( - [[-0.9192, -0.8481, -1.1259], [-1.1349, -1.0034, -1.2599], [-1.1757, -1.0429, -1.2726]] - ) + # if "large" in checkpoint_url: + # expected_slice = torch.tensor( + # [[-0.7309, -0.7128, -1.0169], [-1.0161, -0.9058, -1.1878], [-1.0478, -0.9411, -1.1911]] + # ) + # elif "huge" in checkpoint_url: + # expected_slice = torch.tensor( + # [[-1.1599, -0.9199, -1.2221], [-1.1952, -0.9269, -1.2307], [-1.2143, -0.9337, -1.2262]] + # ) + # else: + # expected_slice = torch.tensor( + # [[-0.9192, -0.8481, -1.1259], [-1.1349, -1.0034, -1.2599], [-1.1757, -1.0429, -1.2726]] + # ) - # verify logits - assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) + # # verify logits + # assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) + # print(f"Saving model to {pytorch_dump_folder_path}") + # model.save_pretrained(pytorch_dump_folder_path) - print(f"Saving feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) + # print(f"Saving feature 
extractor to {pytorch_dump_folder_path}") + # feature_extractor.save_pretrained(pytorch_dump_folder_path) if __name__ == "__main__": @@ -166,9 +162,9 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path): # Required parameters parser.add_argument( "--checkpoint_url", - default="https://dl.fbaipublicfiles.com/mae/visualize/mae_visualize_vit_base.pth", + default="/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/checkpoint.pth", type=str, - help="URL of the checkpoint you'd like to convert.", + help="Path of the original PyTorch checkpoint you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 9c64268df315..9fd9793a9ba4 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -17,26 +17,19 @@ import collections.abc import math -from copy import deepcopy -from dataclasses import dataclass from typing import Optional, Set, Tuple, Union import numpy as np import torch import torch.utils.checkpoint from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...modeling_outputs import BaseModelOutput +from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import ( - ModelOutput, - add_start_docstrings, - add_start_docstrings_to_model_forward, - logging, - replace_return_docstrings, -) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings from .configuration_videomae import VideoMAEConfig @@ -53,17 +46,17 @@ # sin-cos position encoding # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 -def get_sinusoid_encoding_table(n_position, d_hid): - ''' Sinusoid position encoding table ''' - # TODO: make it with torch instead of numpy - def get_position_angle_vec(position): - return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] +def get_sinusoid_encoding_table(n_position, d_hid): + """Sinusoid position encoding table""" + # TODO: make it with torch instead of numpy + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] - sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - return torch.FloatTensor(sinusoid_table).unsqueeze(0) + return torch.FloatTensor(sinusoid_table).unsqueeze(0) class VideoMAEEmbeddings(nn.Module): @@ -109,11 +102,16 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768, self.image_size = image_size self.patch_size = patch_size self.tubelet_size = int(tubelet_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size) + 
num_patches = ( + (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size) + ) self.num_patches = num_patches - self.projection = nn.Conv3d(in_channels=num_channels, out_channels=embed_dim, - kernel_size = (self.tubelet_size, patch_size[0],patch_size[1]), - stride=(self.tubelet_size, patch_size[0], patch_size[1])) + self.projection = nn.Conv3d( + in_channels=num_channels, + out_channels=embed_dim, + kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]), + stride=(self.tubelet_size, patch_size[0], patch_size[1]), + ) def forward(self, pixel_values): batch_size, num_channels, time, height, width = pixel_values.shape @@ -126,7 +124,6 @@ def forward(self, pixel_values): return x -# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention ViT->VideoMAE class VideoMAESelfAttention(nn.Module): def __init__(self, config: VideoMAEConfig) -> None: super().__init__() @@ -140,9 +137,16 @@ def __init__(self, config: VideoMAEConfig) -> None: self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) - self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + + if config.qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(self.all_head_size)) + self.v_bias = nn.Parameter(torch.zeros(self.all_head_size)) + else: + self.q_bias = None + self.v_bias = None self.dropout = nn.Dropout(config.attention_probs_dropout_prob) @@ -154,11 +158,15 @@ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - mixed_query_layer = self.query(hidden_states) + + k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None + keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias) + values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias) + queries = nn.functional.linear(input=hidden_states, weight=self.query.weight, bias=self.q_bias) - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(keys) + value_layer = self.transpose_for_scores(values) + query_layer = self.transpose_for_scores(queries) # Take the dot product between "query" and "key" to get the raw attention scores. 
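The projection calls above implement VideoMAE's bias convention: query and value get learnable biases, while the key projection is built without one and receives an all-zero bias at call time. A minimal stand-alone sketch with illustrative sizes:

```python
import torch
from torch import nn

# Query and value keep learnable biases; the key projection gets a fixed all-zero bias.
# hidden_size and sequence length are illustrative, not the real model dimensions.
hidden_size, seq_len = 8, 5
hidden_states = torch.randn(1, seq_len, hidden_size)

query = nn.Linear(hidden_size, hidden_size, bias=False)
key = nn.Linear(hidden_size, hidden_size, bias=False)
value = nn.Linear(hidden_size, hidden_size, bias=False)
q_bias = nn.Parameter(torch.zeros(hidden_size))
v_bias = nn.Parameter(torch.zeros(hidden_size))
k_bias = torch.zeros_like(v_bias, requires_grad=False)

queries = nn.functional.linear(hidden_states, query.weight, q_bias)
keys = nn.functional.linear(hidden_states, key.weight, k_bias)
values = nn.functional.linear(hidden_states, value.weight, v_bias)
print(queries.shape, keys.shape, values.shape)  # each: torch.Size([1, 5, 8])
```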
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) @@ -458,7 +466,7 @@ def __init__(self, config): self.embeddings = VideoMAEEmbeddings(config) self.encoder = VideoMAEEncoder(config) - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm = nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # Initialize weights and apply final processing self.post_init() @@ -540,4 +548,113 @@ def forward( last_hidden_state=sequence_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, - ) \ No newline at end of file + ) + + +@add_start_docstrings( + """VideoMAE Model transformer with a video classification head on top (a linear layer on top of the final hidden state of + the [CLS] token) e.g. for ImageNet.""", + VIDEOMAE_START_DOCSTRING, +) +class VideoMAEForVideoClassification(VideoMAEPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.num_labels = config.num_labels + self.videomae = VideoMAEModel(config) + + # Classifier head + self.fc_norm = nn.LayerNorm(config.hidden_size) if config.use_mean_pooling else None + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
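The classification head above either mean-pools all tokens through an extra `fc_norm` layer (`use_mean_pooling=True`) or takes the first token before the linear classifier. A self-contained sketch of the two pooling paths with illustrative sizes:

```python
import torch
from torch import nn

# Mean pooling over all patch tokens versus taking the first token, followed by a linear
# classifier. Hidden size, sequence length and number of labels are illustrative.
hidden_size, num_labels = 768, 400
sequence_output = torch.randn(2, 1568, hidden_size)  # (batch, seq_len, hidden_size)

use_mean_pooling = True
fc_norm = nn.LayerNorm(hidden_size) if use_mean_pooling else None
classifier = nn.Linear(hidden_size, num_labels)

if fc_norm is not None:
    pooled = fc_norm(sequence_output.mean(1))  # (2, hidden_size)
else:
    pooled = sequence_output[:, 0]             # first token
logits = classifier(pooled)                    # (2, num_labels)
print(logits.shape)
```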
+ + Returns: + + Examples: + + ```python + >>> from transformers import AutoFeatureExtractor, VideoMAEForVideoClassification + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("nanjing/videomae-base") + >>> model = VideoMAEForVideoClassification.from_pretrained("nanjing/videomae-base") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.video_mae( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + if self.fc_norm is not None: + return self.fc_norm(sequence_output.mean(1)) + else: + return sequence_output[:, 0] + + logits = self.classifier(sequence_output[:, 0, :]) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py index 634937ef4152..0c589e8a8e18 100644 --- a/src/transformers/models/videomae/test.py +++ b/src/transformers/models/videomae/test.py @@ -1,10 +1,12 @@ import torch + from transformers import VideoMAEConfig, VideoMAEModel + model = VideoMAEModel(VideoMAEConfig()) -pixel_values = torch.randn(1,3,16,224,224) +pixel_values = torch.randn(1, 3, 16, 224, 224) outputs = model(pixel_values) -print(outputs.keys()) \ No newline at end of file +print(outputs.keys()) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 3012da64e253..d994431a6c6c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4763,6 +4763,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class VideoMAEForVideoClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class VideoMAELayer(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 24fe543ff655..b1c4792b9a6d 100644 --- 
a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -34,7 +34,7 @@ import torch from torch import nn - from transformers import VideoMAEForPreTraining, VideoMAEModel + from transformers import VideoMAEForPreTraining, VideoMAEForVideoClassification, VideoMAEModel from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple @@ -151,7 +151,9 @@ class VideoMAEModelTest(ModelTesterMixin, unittest.TestCase): attention_mask and seq_length. """ - all_model_classes = (VideoMAEModel, VideoMAEForPreTraining) if is_torch_available() else () + all_model_classes = ( + (VideoMAEModel, VideoMAEForPreTraining, VideoMAEForVideoClassification) if is_torch_available() else () + ) test_pruning = False test_torchscript = False From ea8a6e6e6eb08e9533d940348a5a7d069c9fd23e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 22 May 2022 12:24:50 +0200 Subject: [PATCH 03/42] Improve conversion script --- .../videomae/convert_videomae_to_pytorch.py | 63 ++++++++++++------- .../models/videomae/modeling_videomae.py | 8 +-- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 2e9946c805bb..3530fe38a37b 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -15,11 +15,32 @@ """Convert ViT MAE checkpoints from the original repository: https://github.com/facebookresearch/mae""" import argparse +import json import torch from transformers import VideoMAEConfig, VideoMAEForVideoClassification +from huggingface_hub import hf_hub_download + +def get_videomae_config(checkpoint_path): + config = VideoMAEConfig() + + if "large" in checkpoint_path: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + + config.num_labels = 400 + repo_id = "datasets/huggingface/label-files" + filename = "kinetics400-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + return config def rename_key(name): if "cls_token" in name: @@ -105,34 +126,24 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict -def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path): - config = VideoMAEConfig() - if "large" in checkpoint_url: - config.hidden_size = 1024 - config.intermediate_size = 4096 - config.num_hidden_layers = 24 - config.num_attention_heads = 16 - elif "huge" in checkpoint_url: - config.patch_size = 14 - config.hidden_size = 1280 - config.intermediate_size = 5120 - config.num_hidden_layers = 32 - config.num_attention_heads = 16 - - config.num_labels = 400 +def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub): + config = get_videomae_config(checkpoint_path) model = VideoMAEForVideoClassification(config) - state_dict = torch.load(checkpoint_url, map_location="cpu")["module"] + state_dict = torch.load(checkpoint_path, map_location="cpu")["module"] new_state_dict = convert_state_dict(state_dict, config) model.load_state_dict(new_state_dict) model.eval() - # # forward pass - # torch.manual_seed(2) - # outputs = model(**inputs) - # logits = outputs.logits + # forward pass + pixel_values = 
torch.load("/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/eating_spaghetti_video.pt") + outputs = model(pixel_values) + logits = outputs.logits + + predicted_class_idx = logits.argmax(-1).item() + print("Predicted class:", model.config.id2label[predicted_class_idx]) # if "large" in checkpoint_url: # expected_slice = torch.tensor( @@ -156,12 +167,17 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path): # print(f"Saving feature extractor to {pytorch_dump_folder_path}") # feature_extractor.save_pretrained(pytorch_dump_folder_path) + if push_to_hub: + print("Pushing to the hub...") + model_name = "nielsr/videomae-base" + model.push_to_hub(model_name, organization="hustvl") + if __name__ == "__main__": parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--checkpoint_url", + "--checkpoint_path", default="/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/checkpoint.pth", type=str, help="Path of the original PyTorch checkpoint you'd like to convert.", @@ -169,6 +185,9 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path): parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) args = parser.parse_args() - convert_videomae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) + convert_videomae_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 9fd9793a9ba4..f6745612212a 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -608,7 +608,7 @@ def forward( ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.video_mae( + outputs = self.videomae( pixel_values, head_mask=head_mask, output_attentions=output_attentions, @@ -619,11 +619,11 @@ def forward( sequence_output = outputs[0] if self.fc_norm is not None: - return self.fc_norm(sequence_output.mean(1)) + sequence_output = self.fc_norm(sequence_output.mean(1)) else: - return sequence_output[:, 0] + sequence_output = sequence_output[:, 0] - logits = self.classifier(sequence_output[:, 0, :]) + logits = self.classifier(sequence_output) loss = None if labels is not None: From e52c3dbc9d055283b298145a435eba9b48621223 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sun, 22 May 2022 14:42:35 +0200 Subject: [PATCH 04/42] Add VideoMAEForPreTraining --- .../models/videomae/configuration_videomae.py | 60 ++--- .../videomae/convert_videomae_to_pytorch.py | 5 +- .../models/videomae/modeling_videomae.py | 215 +++++++++++++++++- src/transformers/models/videomae/test.py | 7 +- 4 files changed, 247 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 11929d2356e8..d3450c2ce06f 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -37,6 +37,16 @@ class VideoMAEConfig(PretrainedConfig): Args: + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. 
+ num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + num_frames (`int`, *optional*, defaults to 16): + The number of frames in each video. + tubelet_size (`int`, *optional*, defaults to 2): + The number of tubelets. hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -56,29 +66,18 @@ class VideoMAEConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - image_size (`int`, *optional*, defaults to 224): - The size (resolution) of each image. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. - num_channels (`int`, *optional*, defaults to 3): - The number of input channels. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries, keys and values. use_mean_pooling (`bool`, *optional*, defaults to `True`): Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token. - decoder_num_attention_heads (`int`, *optional*, defaults to 12): + decoder_num_attention_heads (`int`, *optional*, defaults to 6): Number of attention heads for each attention layer in the decoder. - decoder_hidden_size (`int`, *optional*, defaults to 512): + decoder_hidden_size (`int`, *optional*, defaults to 384): Dimensionality of the decoder. - decoder_num_hidden_layers (`int`, *optional*, defaults to 8): + decoder_num_hidden_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the decoder. - decoder_intermediate_size (`int`, *optional*, defaults to 2048): + decoder_intermediate_size (`int`, *optional*, defaults to 1536): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. - mask_ratio (`float`, *optional*, defaults to 0.75): - The ratio of the number of masked tokens in the input sequence. - norm_pix_loss (`bool`, *optional*, defaults to `False`): - Whether or not to train with normalized pixels (see Table 3 in the paper). Using normalized pixels improved - representation quality in the experiments of the authors. 
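With the defaults documented above (224x224 frames, 16x16 spatial patches, 16 frames, tubelet size 2), the encoder sequence length works out to 1568 tubelet patches; a quick check:

```python
# Number of tubelet patches implied by the documented defaults.
image_size, patch_size = 224, 16
num_frames, tubelet_size = 16, 2

patches_per_frame = (image_size // patch_size) ** 2              # 14 * 14 = 196
num_patches = patches_per_frame * (num_frames // tubelet_size)   # 196 * 8 = 1568
print(num_patches)
```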
Example: @@ -98,6 +97,11 @@ class VideoMAEConfig(PretrainedConfig): def __init__( self, + image_size=224, + patch_size=16, + num_channels=3, + num_frames=16, + tubelet_size=2, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -107,22 +111,22 @@ def __init__( attention_probs_dropout_prob=0.0, initializer_range=0.02, layer_norm_eps=1e-12, - is_encoder_decoder=False, - image_size=224, - patch_size=16, - num_channels=3, qkv_bias=True, use_mean_pooling=True, - decoder_num_attention_heads=16, - decoder_hidden_size=512, - decoder_num_hidden_layers=8, - decoder_intermediate_size=2048, - mask_ratio=0.75, - norm_pix_loss=False, + decoder_num_attention_heads=6, + decoder_hidden_size=384, + decoder_num_hidden_layers=12, + decoder_intermediate_size=1536, **kwargs ): super().__init__(**kwargs) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_frames = num_frames + self.tubelet_size = tubelet_size + self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -132,14 +136,10 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.image_size = image_size - self.patch_size = patch_size - self.num_channels = num_channels self.qkv_bias = qkv_bias self.use_mean_pooling = use_mean_pooling + self.decoder_num_attention_heads = decoder_num_attention_heads self.decoder_hidden_size = decoder_hidden_size self.decoder_num_hidden_layers = decoder_num_hidden_layers self.decoder_intermediate_size = decoder_intermediate_size - self.mask_ratio = mask_ratio - self.norm_pix_loss = norm_pix_loss diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 3530fe38a37b..bbcf94234d9c 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -19,13 +19,13 @@ import torch +from huggingface_hub import hf_hub_download from transformers import VideoMAEConfig, VideoMAEForVideoClassification -from huggingface_hub import hf_hub_download def get_videomae_config(checkpoint_path): config = VideoMAEConfig() - + if "large" in checkpoint_path: config.hidden_size = 1024 config.intermediate_size = 4096 @@ -42,6 +42,7 @@ def get_videomae_config(checkpoint_path): return config + def rename_key(name): if "cls_token" in name: name = name.replace("cls_token", "videomae.embeddings.cls_token") diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index f6745612212a..4fab43bf2575 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -17,6 +17,8 @@ import collections.abc import math +from copy import deepcopy +from dataclasses import dataclass from typing import Optional, Set, Tuple, Union import numpy as np @@ -29,7 +31,13 @@ from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) from 
.configuration_videomae import VideoMAEConfig @@ -44,6 +52,29 @@ ] +@dataclass +class VideoMAEDecoderOutput(ModelOutput): + """ + Class for VideoMAEDecoder's outputs, with potential hidden states and attentions. + + Args: + logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + # sin-cos position encoding # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 def get_sinusoid_encoding_table(n_position, d_hid): @@ -73,10 +104,12 @@ def __init__(self, config): patch_size=config.patch_size, num_channels=config.num_channels, embed_dim=config.hidden_size, + num_frames=config.num_frames, + tubelet_size=config.tubelet_size, ) - num_patches = self.patch_embeddings.num_patches + self.num_patches = self.patch_embeddings.num_patches # fixed sin-cos embedding - self.position_embeddings = get_sinusoid_encoding_table(num_patches, config.hidden_size) + self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size) self.config = config def forward(self, pixel_values): @@ -158,7 +191,7 @@ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: def forward( self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: - + k_bias = torch.zeros_like(self.v_bias, requires_grad=False) if self.q_bias is not None else None keys = nn.functional.linear(input=hidden_states, weight=self.key.weight, bias=k_bias) values = nn.functional.linear(input=hidden_states, weight=self.value.weight, bias=self.v_bias) @@ -466,7 +499,9 @@ def __init__(self, config): self.embeddings = VideoMAEEmbeddings(config) self.encoder = VideoMAEEncoder(config) - self.layernorm = nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm = ( + nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + ) # Initialize weights and apply final processing self.post_init() @@ -551,6 +586,176 @@ def forward( ) +class VideoMAEDecoder(nn.Module): + def __init__(self, config, num_patches): + super().__init__() + + decoder_num_labels = 3 * config.tubelet_size * config.patch_size**2 + + decoder_config = deepcopy(config) + decoder_config.hidden_size = config.decoder_hidden_size + decoder_config.num_hidden_layers = config.decoder_num_hidden_layers + decoder_config.num_attention_heads = config.decoder_num_attention_heads + decoder_config.intermediate_size = 
config.decoder_intermediate_size + self.decoder_layers = nn.ModuleList( + [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)] + ) + + self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size) + self.decoder_pred = ( + nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity() + ) + + self.gradient_checkpointing = False + self.config = config + + def forward( + self, + hidden_states, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + # apply Transformer layers (blocks) + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + for i, layer_module in enumerate(self.decoder_layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + None, + ) + else: + layer_outputs = layer_module(hidden_states, head_mask=None, output_attentions=output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + hidden_states = self.decoder_norm(hidden_states) + + # predictor projection + logits = self.decoder_pred(hidden_states) + + if not return_dict: + return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None) + return VideoMAEDecoderOutput( + logits=logits, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@add_start_docstrings( + "The VideoMAE Model transformer with the decoder on top for self-supervised pre-training.", + VIDEOMAE_START_DOCSTRING, +) +class VideoMAEForPreTraining(VideoMAEPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.videomae = VideoMAEModel(config) + + self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=False) + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size)) + self.position_embeddings = get_sinusoid_encoding_table( + self.videomae.embeddings.num_patches, config.decoder_hidden_size + ) + + self.decoder = VideoMAEDecoder(config, num_patches=self.videomae.embeddings.num_patches) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values=None, + bool_masked_pos=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + ```python + >>> from transformers import AutoFeatureExtractor, VideoMAEForPreTraining + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base") + >>> model = VideoMAEForPreTraining.from_pretrained("facebook/vit-mae-base") + + >>> inputs = feature_extractor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> 
loss = outputs.loss + >>> mask = outputs.mask + >>> ids_restore = outputs.ids_restore + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.videomae( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + batch_size, seq_len, num_channels = sequence_output.shape + + # we don't unshuffle the correct visible token order, + # but shuffle the pos embedding accordingly. + # TODO check for bool_masked_pos to be available + expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values) + expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach() + pos_emd_vis = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) + pos_emd_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels) + x_full = torch.cat([sequence_output + pos_emd_vis, self.mask_token + pos_emd_mask], dim=1) # [B, N, C_d] + + decoder_outputs = self.decoder(x_full, pos_emd_mask.shape[1]) # [B, N_mask, 3 * 16 * 16] + logits = decoder_outputs.logits + + # TODO compute loss + loss = None + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return BaseModelOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @add_start_docstrings( """VideoMAE Model transformer with a video classification head on top (a linear layer on top of the final hidden state of the [CLS] token) e.g. for ImageNet.""", diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py index 0c589e8a8e18..9d3cfc160565 100644 --- a/src/transformers/models/videomae/test.py +++ b/src/transformers/models/videomae/test.py @@ -1,12 +1,13 @@ import torch -from transformers import VideoMAEConfig, VideoMAEModel +from transformers import VideoMAEConfig, VideoMAEForPreTraining -model = VideoMAEModel(VideoMAEConfig()) +model = VideoMAEForPreTraining(VideoMAEConfig()) pixel_values = torch.randn(1, 3, 16, 224, 224) +bool_masked_pos = torch.randint(0, 1, (1, 3 * 16 * 224 * 224)).bool() -outputs = model(pixel_values) +outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) print(outputs.keys()) From ff28d7424584dafc87a9d756715323f043e55142 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 23 May 2022 09:17:58 +0200 Subject: [PATCH 05/42] Add VideoMAEFeatureExtractor --- docs/source/en/model_doc/videomae.mdx | 7 +- src/transformers/__init__.py | 2 + src/transformers/models/videomae/__init__.py | 20 ++- .../videomae/convert_videomae_to_pytorch.py | 2 +- .../videomae/feature_extraction_videomae.py | 163 ++++++++++++++++++ .../utils/dummy_vision_objects.py | 7 + 6 files changed, 196 insertions(+), 5 deletions(-) create mode 100644 src/transformers/models/videomae/feature_extraction_videomae.py diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx index c36f40dd52f2..46f7040582f3 100644 --- a/docs/source/en/model_doc/videomae.mdx +++ b/docs/source/en/model_doc/videomae.mdx @@ -25,7 +25,7 @@ Tips: -This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](). 
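The pre-training forward pass above uses `bool_masked_pos` to split the fixed position embeddings into a visible part, added to the encoder output, and a masked part, added to the mask token. A small illustration of that boolean indexing with toy sizes:

```python
import torch

# Boolean indexing of position embeddings into visible and masked groups, as in the
# pre-training forward above. Sizes are tiny for readability; the real sequence length
# is one position per tubelet patch (e.g. 1568 for 16 frames at 224x224).
batch_size, seq_len, hidden_size = 2, 8, 4
position_embeddings = torch.randn(1, seq_len, hidden_size)
expanded = position_embeddings.expand(batch_size, -1, -1)

bool_masked_pos = torch.zeros(batch_size, seq_len, dtype=torch.bool)
bool_masked_pos[:, seq_len // 2 :] = True  # mask the second half of the positions

pos_visible = expanded[~bool_masked_pos].reshape(batch_size, -1, hidden_size)
pos_masked = expanded[bool_masked_pos].reshape(batch_size, -1, hidden_size)
print(pos_visible.shape, pos_masked.shape)  # torch.Size([2, 4, 4]) twice
```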
@@ -34,6 +34,11 @@ The original code can be found [here](). [[autodoc]] VideoMAEConfig +## VideoMAEFeatureExtractor + +[[autodoc]] VideoMAEFeatureExtractor + - __call__ + ## VideoMAEModel [[autodoc]] VideoMAEModel diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 68ed1b3ca3eb..ba8ea1156839 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -654,6 +654,7 @@ _import_structure["models.perceiver"].append("PerceiverFeatureExtractor") _import_structure["models.poolformer"].append("PoolFormerFeatureExtractor") _import_structure["models.segformer"].append("SegformerFeatureExtractor") + _import_structure["models.videomae"].append("VideoMAEFeatureExtractor") _import_structure["models.vilt"].append("ViltFeatureExtractor") _import_structure["models.vilt"].append("ViltProcessor") _import_structure["models.vit"].append("ViTFeatureExtractor") @@ -3385,6 +3386,7 @@ from .models.perceiver import PerceiverFeatureExtractor from .models.poolformer import PoolFormerFeatureExtractor from .models.segformer import SegformerFeatureExtractor + from .models.videomae import VideoMAEFeatureExtractor from .models.vilt import ViltFeatureExtractor, ViltProcessor from .models.vit import ViTFeatureExtractor from .models.yolos import YolosFeatureExtractor diff --git a/src/transformers/models/videomae/__init__.py b/src/transformers/models/videomae/__init__.py index 87ceabf002d6..fb239c6063ba 100644 --- a/src/transformers/models/videomae/__init__.py +++ b/src/transformers/models/videomae/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available _import_structure = { @@ -33,12 +33,19 @@ _import_structure["modeling_videomae"] = [ "VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST", "VideoMAEForPreTraining", - "VideoMAELayer", "VideoMAEModel", "VideoMAEPreTrainedModel", "VideoMAEForVideoClassification", ] +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["feature_extraction_videomae"] = ["VideoMAEFeatureExtractor"] + if TYPE_CHECKING: from .configuration_videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig @@ -52,11 +59,18 @@ VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST, VideoMAEForPreTraining, VideoMAEForVideoClassification, - VideoMAELayer, VideoMAEModel, VideoMAEPreTrainedModel, ) + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .feature_extraction_videomae import VideoMAEFeatureExtractor + else: import sys diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index bbcf94234d9c..125dce5bf81f 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Convert ViT MAE checkpoints from the original repository: https://github.com/facebookresearch/mae""" +"""Convert VideoMAE checkpoints from the original repository: https://github.com/MCG-NJU/VideoMAE""" import argparse import json diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py new file mode 100644 index 000000000000..7d2f1c8a8dce --- /dev/null +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -0,0 +1,163 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for VideoMAE.""" + +from turtle import width +from typing import Optional, Union + +import numpy as np +from PIL import Image + +from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +class VideoMAEFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): + r""" + Constructs a VideoMAE feature extractor. + + This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users + should refer to this superclass for more information regarding those methods. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the input to a certain `size`. + size (`int` or `Tuple(int)`, *optional*, defaults to 224): + Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an + integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is + set to `True`. + resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): + An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, + `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. Only has an effect + if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the input to a certain `size`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the input with mean and standard deviation. + image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of means for each channel, to be used when normalizing images. + image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + The sequence of standard deviations for each channel, to be used when normalizing images. 
+ """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize=True, + size=224, + resample=Image.BILINEAR, + do_center_crop=True, + do_normalize=True, + image_mean=None, + image_std=None, + **kwargs + ): + super().__init__(**kwargs) + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + + def resize_video(self, video, size, resample='bilinear'): + return [self.resize(frame, size, resample) for frame in video] + + def crop_video(self, video, size): + return [self.center_crop(frame, size) for frame in video] + + def normalize_video(self, video, mean, std): + return [self.normalize(frame, mean, std) for frame in video] + + def __call__( + self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + ) -> BatchFeature: + """ + Main method to prepare for the model one or several video(s). + + + + NumPy arrays are converted to PIL images when resizing, so the most efficient is to pass PIL images. + + + + Args: + videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`): + The video or batch of videos to be prepared. Each video should be a list of frames, which can be either + PIL images or NumPy arrays. In case of a NumPy array, each frame should be of shape (H, W, C), where H + and W are frame height and width, and C is a number of channels. + + return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, num_frames, + height, width). + """ + # Input type checking for clearer error + valid_videos = False + is_batched = False + + # Check that videos have a valid type + if isinstance(videos, (list, tuple)): + if isinstance(videos[0], (Image.Image, np.ndarray)): + valid_videos = True + elif isinstance(videos[0], (list, tuple)) and isinstance( + videos[0][0], (Image.Image, np.ndarray) + ): + valid_videos = True + is_batched = True + + if not valid_videos: + raise ValueError( + "Videos must of type `List[PIL.Image.Image]`, `List[np.ndarray]` (single example), " + "`List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]` (batch of examples)." 
+ ) + + if not is_batched: + videos = [videos] + + # transformations (resizing + center cropping + normalization) + if self.do_resize and self.size is not None: + videos = [self.resize_video(video, size=self.size, resample=self.resample) for video in videos] + if self.do_center_crop and self.size is not None: + videos = [self.crop_video(video, size=self.size) for video in videos] + if self.do_normalize: + videos = [self.normalize_video(image=video, mean=self.image_mean, std=self.image_std) for video in videos] + + # return as BatchFeature + data = {"pixel_values": videos} + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + return encoded_inputs \ No newline at end of file diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index e5d2bced9e04..30228e022222 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -150,6 +150,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class VideoMAEFeatureExtractor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class ViltFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] From ef164514fa72fddc3615659eaa83c9edaf20eeea Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 23 May 2022 09:28:23 +0200 Subject: [PATCH 06/42] Improve VideoMAEFeatureExtractor --- .../videomae/feature_extraction_videomae.py | 21 +++++---------- src/transformers/models/videomae/test.py | 27 ++++++++++++++----- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 7d2f1c8a8dce..e6a47a44720f 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -14,20 +14,13 @@ # limitations under the License. 
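The `__call__` pipeline above is applied frame by frame. A rough NumPy/PIL equivalent of what happens to a single video (illustrative only; the actual helpers go through `ImageFeatureExtractionMixin` and handle channel ordering and tensor conversion themselves):

```python
import numpy as np
from PIL import Image

size = 224
image_mean = np.array([0.5, 0.5, 0.5], dtype=np.float32)
image_std = np.array([0.5, 0.5, 0.5], dtype=np.float32)

# A video as a list of (height, width, channels) frames in [0, 1].
video = [np.random.rand(360, 640, 3).astype(np.float32) for _ in range(16)]

processed = []
for frame in video:
    pil = Image.fromarray((frame * 255).astype(np.uint8))
    pil = pil.resize((size, size), resample=Image.BILINEAR)  # resize
    arr = np.asarray(pil).astype(np.float32) / 255.0         # back to [0, 1]
    arr = (arr - image_mean) / image_std                     # normalize per channel
    processed.append(arr.transpose(2, 0, 1))                 # to channels-first

pixel_values = np.stack(processed)  # (num_frames, num_channels, height, width)
print(pixel_values.shape)
```

Note this sketch folds the center crop into the resize (with an integer `size` the resize already produces the target resolution); the real implementation keeps resize and crop as separate, individually toggleable transforms.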
"""Feature extractor class for VideoMAE.""" -from turtle import width from typing import Optional, Union import numpy as np from PIL import Image from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ImageFeatureExtractionMixin, - ImageInput, - is_torch_tensor, -) +from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageFeatureExtractionMixin, ImageInput from ...utils import TensorType, logging @@ -84,7 +77,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD - def resize_video(self, video, size, resample='bilinear'): + def resize_video(self, video, size, resample="bilinear"): return [self.resize(frame, size, resample) for frame in video] def crop_video(self, video, size): @@ -92,7 +85,7 @@ def crop_video(self, video, size): def normalize_video(self, video, mean, std): return [self.normalize(frame, mean, std) for frame in video] - + def __call__( self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs ) -> BatchFeature: @@ -133,9 +126,7 @@ def __call__( if isinstance(videos, (list, tuple)): if isinstance(videos[0], (Image.Image, np.ndarray)): valid_videos = True - elif isinstance(videos[0], (list, tuple)) and isinstance( - videos[0][0], (Image.Image, np.ndarray) - ): + elif isinstance(videos[0], (list, tuple)) and isinstance(videos[0][0], (Image.Image, np.ndarray)): valid_videos = True is_batched = True @@ -154,10 +145,10 @@ def __call__( if self.do_center_crop and self.size is not None: videos = [self.crop_video(video, size=self.size) for video in videos] if self.do_normalize: - videos = [self.normalize_video(image=video, mean=self.image_mean, std=self.image_std) for video in videos] + videos = [self.normalize_video(video, mean=self.image_mean, std=self.image_std) for video in videos] # return as BatchFeature data = {"pixel_values": videos} encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) - return encoded_inputs \ No newline at end of file + return encoded_inputs diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py index 9d3cfc160565..5a95ad9764a1 100644 --- a/src/transformers/models/videomae/test.py +++ b/src/transformers/models/videomae/test.py @@ -1,13 +1,26 @@ -import torch +# import torch +import numpy as np -from transformers import VideoMAEConfig, VideoMAEForPreTraining +# from transformers import VideoMAEConfig, VideoMAEForPreTraining +from transformers import VideoMAEFeatureExtractor -model = VideoMAEForPreTraining(VideoMAEConfig()) +# model = VideoMAEForPreTraining(VideoMAEConfig()) -pixel_values = torch.randn(1, 3, 16, 224, 224) -bool_masked_pos = torch.randint(0, 1, (1, 3 * 16 * 224 * 224)).bool() +# pixel_values = torch.randn(1, 3, 16, 224, 224) +# bool_masked_pos = torch.randint(0, 1, (1, 3 * 16 * 224 * 224)).bool() -outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) +# outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) -print(outputs.keys()) +# print(outputs.keys()) + +feature_extractor = VideoMAEFeatureExtractor() + +video = [np.random.rand(512, 640, 3), np.random.rand(312, 200, 3)] + +video = np.random.rand(16, 360, 640, 3) +video = [video[i] for i in range(video.shape[0])] + +encoding = feature_extractor(video, return_tensors="pt") + +print(encoding.pixel_values.shape) From 
27bfe2b8cd6d6ff196b64f4ad0d7eafc96d8cc8d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 23 May 2022 10:48:44 +0200 Subject: [PATCH 07/42] Improve docs --- docs/source/en/model_doc/videomae.mdx | 10 +++++----- .../models/videomae/convert_videomae_to_pytorch.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx index 46f7040582f3..72ec84202105 100644 --- a/docs/source/en/model_doc/videomae.mdx +++ b/docs/source/en/model_doc/videomae.mdx @@ -14,19 +14,19 @@ specific language governing permissions and limitations under the License. ## Overview -The VideoMAE model was proposed in []() by . - +The VideoMAE model was proposed in [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. +VideoMAE extends masked auto encoders ([MAE](vit_mae)) to video, claiming state-of-the-art performance on several video classification benchmarks. The abstract from the paper is the following: -** +*Pre-training video transformers on extra large-scale datasets is generally required to achieve premier performance on relatively small datasets. In this paper, we show that video masked autoencoders (VideoMAE) are data-efficient learners for self-supervised video pre-training (SSVP). We are inspired by the recent ImageMAE and propose customized video tube masking and reconstruction. These simple designs turn out to be effective for overcoming information leakage caused by the temporal correlation during video reconstruction. We obtain three important findings on SSVP: (1) An extremely high proportion of masking ratio (i.e., 90% to 95%) still yields favorable performance of VideoMAE. The temporally redundant video content enables higher masking ratio than that of images. (2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. (3) VideoMAE shows that data quality is more important than data quantity for SSVP. Domain shift between pre-training and target datasets are important issues in SSVP. Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinects-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data.* Tips: - +- One can use [`VideoMAEFeatureExtractor`] to prepare videos for the model. This model was contributed by [nielsr](https://huggingface.co/nielsr). -The original code can be found [here](). +The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). 
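To make the tip above concrete, a rough end-to-end sketch of the intended usage (the checkpoint identifier is a placeholder — the final hub name is still being settled in this PR — and random frames stand in for a real video):

```python
import numpy as np
from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification

checkpoint = "nielsr/videomae-base"  # placeholder name, not final

feature_extractor = VideoMAEFeatureExtractor.from_pretrained(checkpoint)
model = VideoMAEForVideoClassification.from_pretrained(checkpoint)

# A video is passed as a list of frames, each of shape (height, width, num_channels).
video = [np.random.rand(360, 640, 3) for _ in range(16)]

inputs = feature_extractor(video, return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, num_labels)
```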
## VideoMAEConfig diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 125dce5bf81f..27e07e3a851d 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -170,8 +170,8 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_ if push_to_hub: print("Pushing to the hub...") - model_name = "nielsr/videomae-base" - model.push_to_hub(model_name, organization="hustvl") + model_name = "videomae-base" + model.push_to_hub(model_name, organization="nielsr") if __name__ == "__main__": From 22e18ab349046437e016e15b1befacc3692f0608 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 23 May 2022 16:08:22 +0200 Subject: [PATCH 08/42] Add first draft of model tests --- src/transformers/__init__.py | 2 + src/transformers/models/auto/__init__.py | 2 + src/transformers/models/videomae/test.py | 10 +- .../models/videomae/test_model.py | 12 + src/transformers/utils/dummy_pt_objects.py | 3 + .../models/videomae/test_modeling_videomae.py | 208 ++---------------- tests/test_modeling_common.py | 2 + 7 files changed, 44 insertions(+), 195 deletions(-) create mode 100644 src/transformers/models/videomae/test_model.py diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ba8ea1156839..26c86ae6a81e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -801,6 +801,7 @@ "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_MAPPING", @@ -3511,6 +3512,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_MAPPING, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 09961bae14fd..4706d90db0c4 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -63,6 +63,7 @@ "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", "MODEL_FOR_VISION_2_SEQ_MAPPING", "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", "MODEL_MAPPING", @@ -203,6 +204,7 @@ MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_FOR_VISION_2_SEQ_MAPPING, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING, MODEL_MAPPING, diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py index 5a95ad9764a1..e4c0fc91f77e 100644 --- a/src/transformers/models/videomae/test.py +++ b/src/transformers/models/videomae/test.py @@ -1,18 +1,10 @@ # import torch import numpy as np -# from transformers import VideoMAEConfig, VideoMAEForPreTraining from transformers import VideoMAEFeatureExtractor -# model = VideoMAEForPreTraining(VideoMAEConfig()) - -# pixel_values = torch.randn(1, 3, 16, 224, 224) -# bool_masked_pos = torch.randint(0, 1, (1, 3 * 16 * 224 * 224)).bool() - -# outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) - -# print(outputs.keys()) +# test feature 
extractor feature_extractor = VideoMAEFeatureExtractor() diff --git a/src/transformers/models/videomae/test_model.py b/src/transformers/models/videomae/test_model.py new file mode 100644 index 000000000000..2600790df1d0 --- /dev/null +++ b/src/transformers/models/videomae/test_model.py @@ -0,0 +1,12 @@ +from transformers import VideoMAEConfig, VideoMAEModel +import torch + +## test model + +model = VideoMAEModel(VideoMAEConfig()) + +pixel_values = torch.randn(1, 3, 16, 224, 224) + +outputs = model(pixel_values) + +print(outputs.keys()) \ No newline at end of file diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d994431a6c6c..a9eed7566e5f 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -406,6 +406,9 @@ def load_tf_weights_in_albert(*args, **kwargs): MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None +MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = None + + MODEL_FOR_VISION_2_SEQ_MAPPING = None diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index b1c4792b9a6d..9b1ada72669c 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -16,7 +16,6 @@ import inspect -import math import tempfile import unittest @@ -35,13 +34,13 @@ from torch import nn from transformers import VideoMAEForPreTraining, VideoMAEForVideoClassification, VideoMAEModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST, to_2tuple + from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): from PIL import Image - from transformers import ViTFeatureExtractor + from transformers import VideoMAEFeatureExtractor class VideoMAEModelTester: @@ -50,8 +49,10 @@ def __init__( parent, batch_size=13, image_size=30, - patch_size=2, num_channels=3, + patch_size=2, + tubelet_size=2, + num_frames=16, is_training=True, use_labels=True, hidden_size=32, @@ -69,8 +70,10 @@ def __init__( self.parent = parent self.batch_size = batch_size self.image_size = image_size - self.patch_size = patch_size self.num_channels = num_channels + self.patch_size = patch_size + self.tubelet_size = tubelet_size + self.num_frames = num_frames self.is_training = is_training self.use_labels = use_labels self.hidden_size = hidden_size @@ -84,8 +87,14 @@ def __init__( self.initializer_range = initializer_range self.scope = scope + # in VideoMAE, the number of tokens equals num_frames/2 * num_patches + num_patches = (image_size // patch_size) ** 2 + self.seq_length = (num_frames // 2) * num_patches + def prepare_config_and_inputs(self): - pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_values = floats_tensor( + [self.batch_size, self.num_channels, self.num_frames, self.image_size, self.image_size] + ) labels = None if self.use_labels: @@ -100,6 +109,8 @@ def get_config(self): image_size=self.image_size, patch_size=self.patch_size, num_channels=self.num_channels, + num_frames=self.num_frames, + tubelet_size=self.tubelet_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, @@ -116,26 +127,15 @@ def create_and_check_model(self, config, pixel_values, labels): model.to(torch_device) model.eval() result = model(pixel_values) - # expected sequence length = (num_patches + 1) * (1 - config.mask_ratio), rounded above - # (we add 1 for the 
[CLS] token) - image_size = to_2tuple(self.image_size) - patch_size = to_2tuple(self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - expected_seq_len = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, expected_seq_len, self.hidden_size)) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def create_and_check_for_pretraining(self, config, pixel_values, labels): model = VideoMAEForPreTraining(config) model.to(torch_device) model.eval() result = model(pixel_values) - # expected sequence length = num_patches - image_size = to_2tuple(self.image_size) - patch_size = to_2tuple(self.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - expected_seq_len = num_patches - expected_num_channels = self.patch_size**2 * self.num_channels - self.parent.assertEqual(result.logits.shape, (self.batch_size, expected_seq_len, expected_num_channels)) + print(result) + assert False def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -200,143 +200,6 @@ def test_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - def test_attention_outputs(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - - # in VideoMAE, the seq_len equals (number of patches + 1) * (1 - mask_ratio), rounded above - image_size = to_2tuple(self.model_tester.image_size) - patch_size = to_2tuple(self.model_tester.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_len = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) - encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len) - encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length) - chunk_length = getattr(self.model_tester, "chunk_length", None) - if chunk_length is not None and hasattr(self.model_tester, "num_hashes"): - encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes - - for model_class in self.all_model_classes: - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = False - config.return_dict = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - # check that output_attentions also work using config - del inputs_dict["output_attentions"] - config.output_attentions = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) - - if chunk_length is not None: - self.assertListEqual( - list(attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, 
encoder_seq_length, encoder_key_length], - ) - out_len = len(outputs) - - # Check attention is always last and order is fine - inputs_dict["output_attentions"] = True - inputs_dict["output_hidden_states"] = True - model = model_class(config) - model.to(torch_device) - model.eval() - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - if hasattr(self.model_tester, "num_hidden_states_types"): - added_hidden_states = self.model_tester.num_hidden_states_types - elif self.is_encoder_decoder: - added_hidden_states = 2 - else: - added_hidden_states = 1 - self.assertEqual(out_len + added_hidden_states, len(outputs)) - - self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions - - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) - if chunk_length is not None: - self.assertListEqual( - list(self_attentions[0].shape[-4:]), - [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length], - ) - else: - self.assertListEqual( - list(self_attentions[0].shape[-3:]), - [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length], - ) - - def test_hidden_states_output(self): - def check_hidden_states_output(inputs_dict, config, model_class): - model = model_class(config) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) - self.assertEqual(len(hidden_states), expected_num_layers) - - # VideoMAE has a different seq_length - image_size = to_2tuple(self.model_tester.image_size) - patch_size = to_2tuple(self.model_tester.patch_size) - num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) - seq_length = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1))) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [seq_length, self.model_tester.hidden_size], - ) - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - inputs_dict["output_hidden_states"] = True - check_hidden_states_output(inputs_dict, config, model_class) - - # check that output_hidden_states also work using config - del inputs_dict["output_hidden_states"] - config.output_hidden_states = True - - check_hidden_states_output(inputs_dict, config, model_class) - - # overwrite from common since VideoMAEForPretraining has random masking, we need to fix the noise - # to generate masks during test - def check_pt_tf_models(self, tf_model, pt_model, pt_inputs_dict): - - # make masks reproducible - np.random.seed(2) - - num_patches = int((pt_model.config.image_size // pt_model.config.patch_size) ** 2) - noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches)) - pt_noise = torch.from_numpy(noise) - - # Add `noise` argument. 
- # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument - pt_inputs_dict["noise"] = pt_noise - - super().check_pt_tf_models(tf_model, pt_model, pt_inputs_dict) - def test_save_load(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -411,35 +274,8 @@ def prepare_img(): class VideoMAEModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return ViTFeatureExtractor.from_pretrained("nanjing/videomae-base") if is_vision_available() else None + return VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") if is_vision_available() else None @slow def test_inference_for_pretraining(self): - # make random mask reproducible across the PT and TF model - np.random.seed(2) - - model = VideoMAEForPreTraining.from_pretrained("nanjing/videomae-base").to(torch_device) - - feature_extractor = self.default_feature_extractor - image = prepare_img() - inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device) - - # prepare a noise vector that will be also used for testing the TF model - # (this way we can ensure that the PT and TF models operate on the same inputs) - videomae_config = VideoMAEConfig() - num_patches = int((videomae_config.image_size // videomae_config.patch_size) ** 2) - noise = np.random.uniform(size=(1, num_patches)) - - # forward pass - with torch.no_grad(): - outputs = model(**inputs, noise=torch.from_numpy(noise).to(device=torch_device)) - - # verify the logits - expected_shape = torch.Size((1, 196, 768)) - self.assertEqual(outputs.logits.shape, expected_shape) - - expected_slice = torch.tensor( - [[-0.0548, -1.7023, -0.9325], [0.3721, -0.5670, -0.2233], [0.8235, -1.3878, -0.3524]] - ) - - self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice.to(torch_device), atol=1e-4)) + raise NotImplementedError("To do") diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index a86b33e88ff2..c05771336e63 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -99,6 +99,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, MODEL_MAPPING, AdaptiveEmbedding, AutoModelForCausalLM, @@ -182,6 +183,7 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): *get_values(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING), *get_values(MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING), *get_values(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING), + *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING), ]: inputs_dict["labels"] = torch.zeros( self.model_tester.batch_size, dtype=torch.long, device=torch_device From d8a1aa552490d25763152e213d4513c011ef48af Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 24 May 2022 09:47:31 +0200 Subject: [PATCH 09/42] Improve VideoMAEForPreTraining --- .../models/videomae/modeling_videomae.py | 52 ++++++++++++++++--- .../models/videomae/test_model.py | 52 +++++++++++++++++-- 2 files changed, 91 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 4fab43bf2575..f9f7546e1b82 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch ViT MAE (masked autoencoder) model.""" +""" PyTorch VideoMAE (masked autoencoder) model.""" import collections.abc @@ -75,6 +75,32 @@ class VideoMAEDecoderOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class VideoMAEForPreTrainingOutput(ModelOutput): + """ + Class for VideoMAEForPreTraining's outputs, with potential hidden states and attentions. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`): + Pixel reconstruction loss. + logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`): + Pixel reconstruction logits. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + # sin-cos position encoding # https://github.com/jadore801120/attention-is-all-you-need-pytorch/blob/master/transformer/Models.py#L31 def get_sinusoid_encoding_table(n_position, d_hid): @@ -112,13 +138,20 @@ def __init__(self, config): self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size) self.config = config - def forward(self, pixel_values): + def forward(self, pixel_values, bool_masked_pos): # create patch embeddings embeddings = self.patch_embeddings(pixel_values) # add position embeddings embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach() + # only keep visible patches + if bool_masked_pos is not None: + batch_size, _, num_channels = embeddings.shape + embeddings = embeddings[~bool_masked_pos].reshape( + batch_size, -1, num_channels + ) # ~bool_masked_pos means visible + return embeddings @@ -522,7 +555,7 @@ class PreTrainedModel def forward( self, pixel_values=None, - noise=None, + bool_masked_pos=None, head_mask=None, output_attentions=None, output_hidden_states=None, @@ -564,7 +597,7 @@ def forward( # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output = self.embeddings(pixel_values) + embedding_output = self.embeddings(pixel_values, bool_masked_pos) encoder_outputs = self.encoder( embedding_output, @@ -684,11 +717,11 @@ def __init__(self, config): self.post_init() @add_start_docstrings_to_model_forward(VIDEOMAE_INPUTS_DOCSTRING) - @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=VideoMAEForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values=None, - 
bool_masked_pos=None, + pixel_values, + bool_masked_pos, head_mask=None, output_attentions=None, output_hidden_states=None, @@ -720,6 +753,7 @@ def forward( outputs = self.videomae( pixel_values, + bool_masked_pos=bool_masked_pos, head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, @@ -727,6 +761,7 @@ def forward( ) sequence_output = outputs[0] + sequence_output = self.encoder_to_decoder(sequence_output) # [B, N_vis, C_d] batch_size, seq_len, num_channels = sequence_output.shape # we don't unshuffle the correct visible token order, @@ -736,6 +771,7 @@ def forward( expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach() pos_emd_vis = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) pos_emd_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels) + x_full = torch.cat([sequence_output + pos_emd_vis, self.mask_token + pos_emd_mask], dim=1) # [B, N, C_d] decoder_outputs = self.decoder(x_full, pos_emd_mask.shape[1]) # [B, N_mask, 3 * 16 * 16] @@ -748,7 +784,7 @@ def forward( output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output - return BaseModelOutput( + return VideoMAEForPreTrainingOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, diff --git a/src/transformers/models/videomae/test_model.py b/src/transformers/models/videomae/test_model.py index 2600790df1d0..cc17e7805474 100644 --- a/src/transformers/models/videomae/test_model.py +++ b/src/transformers/models/videomae/test_model.py @@ -1,12 +1,54 @@ -from transformers import VideoMAEConfig, VideoMAEModel +import numpy as np import torch -## test model +from transformers import VideoMAEConfig, VideoMAEForPreTraining -model = VideoMAEModel(VideoMAEConfig()) + +class TubeMaskingGenerator: + def __init__(self, input_size, mask_ratio): + self.frames, self.height, self.width = input_size + self.num_patches_per_frame = self.height * self.width + self.total_patches = self.frames * self.num_patches_per_frame + self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame) + self.total_masks = self.frames * self.num_masks_per_frame + + def __repr__(self): + repr_str = "Maks: total patches {}, mask patches {}".format(self.total_patches, self.total_masks) + return repr_str + + def __call__(self): + mask_per_frame = np.hstack( + [ + np.zeros(self.num_patches_per_frame - self.num_masks_per_frame), + np.ones(self.num_masks_per_frame), + ] + ) + np.random.shuffle(mask_per_frame) + mask = np.tile(mask_per_frame, (self.frames, 1)).flatten() + return mask + + +num_frames = 16 +input_size = 224 +patch_size = (16, 16) +window_size = (num_frames // 2, input_size // patch_size[0], input_size // patch_size[1]) + +masked_position_generator = TubeMaskingGenerator(input_size=window_size, mask_ratio=0.9) + + +# test model + +model = VideoMAEForPreTraining(VideoMAEConfig()) pixel_values = torch.randn(1, 3, 16, 224, 224) -outputs = model(pixel_values) +bool_masked_pos = masked_position_generator() +print("Shape of bool masked pos:", bool_masked_pos.shape) + +bool_masked_pos = torch.from_numpy(bool_masked_pos) +bool_masked_pos = bool_masked_pos.unsqueeze(0) +bool_masked_pos = bool_masked_pos.flatten(1).to(torch.bool) + +outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) -print(outputs.keys()) \ No newline at end of file +print(outputs.logits.shape) From 2caaee952479c58fcb2de49f9fbc6fbb6a10e4fc Mon Sep 17 00:00:00 2001 From: Niels Rogge 
Date: Tue, 24 May 2022 09:56:46 +0200 Subject: [PATCH 10/42] Fix base_model_prefix --- src/transformers/models/videomae/modeling_videomae.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index f9f7546e1b82..6f641bc0d477 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -24,7 +24,7 @@ import numpy as np import torch import torch.utils.checkpoint -from torch import nn +from torch import embedding, nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN @@ -185,9 +185,8 @@ def forward(self, pixel_values): raise ValueError( f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." ) - x = self.projection(pixel_values).flatten(2).transpose(1, 2) - print("Shape of embeddings:", x.shape) - return x + embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) + return embeddings class VideoMAESelfAttention(nn.Module): @@ -456,7 +455,6 @@ def custom_forward(*inputs): ) -# Copied from transformers.models.vit_mae.modeling_vit_mae.ViTMAEPreTrainedModel with ViTMAE->VideoMAE class VideoMAEPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -464,7 +462,7 @@ class VideoMAEPreTrainedModel(PreTrainedModel): """ config_class = VideoMAEConfig - base_model_prefix = "vit" + base_model_prefix = "videomae" main_input_name = "pixel_values" supports_gradient_checkpointing = True From e321b891eea47001501bc26c82646f89b65c1793 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 24 May 2022 10:17:36 +0200 Subject: [PATCH 11/42] Make model take pixel_values of shape (B, T, C, H, W) --- .../models/videomae/convert_videomae_to_pytorch.py | 1 + src/transformers/models/videomae/modeling_videomae.py | 11 ++++++----- tests/models/videomae/test_modeling_videomae.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 27e07e3a851d..b5357847a7ab 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -140,6 +140,7 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_ # forward pass pixel_values = torch.load("/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/eating_spaghetti_video.pt") + pixel_values = pixel_values.permute(0, 2, 1, 3, 4) outputs = model(pixel_values) logits = outputs.logits diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 6f641bc0d477..4c6265cc7810 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -24,7 +24,7 @@ import numpy as np import torch import torch.utils.checkpoint -from torch import embedding, nn +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN @@ -146,11 +146,10 @@ def forward(self, pixel_values, bool_masked_pos): embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach() # only keep visible patches + # ~bool_masked_pos 
means visible if bool_masked_pos is not None: batch_size, _, num_channels = embeddings.shape - embeddings = embeddings[~bool_masked_pos].reshape( - batch_size, -1, num_channels - ) # ~bool_masked_pos means visible + embeddings = embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) return embeddings @@ -180,11 +179,13 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768, ) def forward(self, pixel_values): - batch_size, num_channels, time, height, width = pixel_values.shape + batch_size, num_frames, num_channels, height, width = pixel_values.shape if height != self.image_size[0] or width != self.image_size[1]: raise ValueError( f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})." ) + # permute to (batch_size, num_channels, num_frames, height, width) + pixel_values = pixel_values.permute(0, 2, 1, 3, 4) embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2) return embeddings diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 9b1ada72669c..a2492cd3ead6 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -89,7 +89,7 @@ def __init__( # in VideoMAE, the number of tokens equals num_frames/2 * num_patches num_patches = (image_size // patch_size) ** 2 - self.seq_length = (num_frames // 2) * num_patches + self.seq_length = (num_frames // tubelet_size) * num_patches def prepare_config_and_inputs(self): pixel_values = floats_tensor( From 7c84302a534940f4cbb2cb932f8bc38166460cf0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 24 May 2022 11:34:02 +0200 Subject: [PATCH 12/42] Add loss computation of VideoMAEForPreTraining --- .../models/videomae/configuration_videomae.py | 4 + .../models/videomae/modeling_videomae.py | 106 +++++++++++++++--- .../models/videomae/test_model.py | 5 +- 3 files changed, 96 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index d3450c2ce06f..3ded151570a7 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -78,6 +78,8 @@ class VideoMAEConfig(PretrainedConfig): Number of hidden layers in the decoder. decoder_intermediate_size (`int`, *optional*, defaults to 1536): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. + norm_pix_loss (`bool`, *optional*, defaults to `True`): + Whether to normalize the target patch pixels. Example: @@ -117,6 +119,7 @@ def __init__( decoder_hidden_size=384, decoder_num_hidden_layers=12, decoder_intermediate_size=1536, + norm_pix_loss=True, **kwargs ): super().__init__(**kwargs) @@ -143,3 +146,4 @@ def __init__( self.decoder_hidden_size = decoder_hidden_size self.decoder_num_hidden_layers = decoder_num_hidden_layers self.decoder_intermediate_size = decoder_intermediate_size + self.norm_pix_loss = norm_pix_loss diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 4c6265cc7810..bf2d0c159046 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. 
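As a worked example of the token count used in the tests above, and of how much of the sequence the 90% tube masking from the earlier test script leaves visible, using the default-style configuration values (assumed here: 16 frames, tubelet size 2, 224x224 frames, 16x16 patches):

```python
num_frames, tubelet_size = 16, 2
image_size, patch_size = 224, 16

num_patches_per_frame = (image_size // patch_size) ** 2            # 14 * 14 = 196
seq_length = (num_frames // tubelet_size) * num_patches_per_frame  # 8 * 196 = 1568
print(seq_length)

# With a 0.9 tube masking ratio, each of the 8 "tubelet frames" masks int(0.9 * 196) = 176
# positions, so 1568 - 8 * 176 = 160 tokens are actually fed to the encoder.
masked = (num_frames // tubelet_size) * int(0.9 * num_patches_per_frame)
print(seq_length - masked)  # 160
```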
+# Copyright 2022 Multimedia Computing Group, Nanjing University and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN +from ...image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer @@ -156,7 +157,11 @@ def forward(self, pixel_values, bool_masked_pos): class PatchEmbeddings(nn.Module): """ - Video to Patch Embedding. + Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels, + height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder. + + The seq_len (the number of patches) equals (number of frames // tubelet_size) * (height // patch_size) * (width // + patch_size). """ @@ -498,9 +503,9 @@ def _set_gradient_checkpointing(self, module, value=False): VIDEOMAE_INPUTS_DOCSTRING = r""" Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See - [`AutoFeatureExtractor.__call__`] for details. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`VideoMAEFeatureExtractor`]. See + [`VideoMAEFeatureExtractor.__call__`] for details. head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): Mask to nullify selected heads of the self-attention modules. 
Mask values selected in `[0, 1]`: @@ -553,7 +558,7 @@ class PreTrainedModel @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC) def forward( self, - pixel_values=None, + pixel_values, bool_masked_pos=None, head_mask=None, output_attentions=None, @@ -566,14 +571,14 @@ def forward( Examples: ```python - >>> from transformers import AutoFeatureExtractor, VideoMAEModel + >>> from transformers import VideoMAEFeatureExtractor, VideoMAEModel >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("nanjing/videomae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") >>> model = VideoMAEModel.from_pretrained("nanjing/videomae-base") >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -586,9 +591,6 @@ def forward( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -644,6 +646,7 @@ def __init__(self, config, num_patches): def forward( self, hidden_states, + return_token_num, output_attentions=False, output_hidden_states=False, return_dict=True, @@ -679,9 +682,11 @@ def custom_forward(*inputs): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - hidden_states = self.decoder_norm(hidden_states) + if return_token_num > 0: + hidden_states = hidden_states[:, -return_token_num:] # predictor projection + hidden_states = self.decoder_norm(hidden_states) logits = self.decoder_pred(hidden_states) if not return_dict: @@ -731,15 +736,15 @@ def forward( Examples: ```python - >>> from transformers import AutoFeatureExtractor, VideoMAEForPreTraining + >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForPreTraining >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base") - >>> model = VideoMAEForPreTraining.from_pretrained("facebook/vit-mae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/vit-mae-base") + >>> model = VideoMAEForPreTraining.from_pretrained("nanjing/vit-mae-base") >>> inputs = feature_extractor(images=image, return_tensors="pt") @@ -777,7 +782,74 @@ def forward( logits = decoder_outputs.logits # TODO compute loss + # TODO check correct format of videos! 
(B, T, C, H, W) loss = None + with torch.no_grad(): + # calculate the labels to be predicted + # first, unnormalize the frames + device = pixel_values.device + mean = torch.as_tensor(IMAGENET_DEFAULT_MEAN).to(device)[None, None, :, None, None] + std = torch.as_tensor(IMAGENET_DEFAULT_STD).to(device)[None, None, :, None, None] + frames = pixel_values * std + mean # in [0, 1] + + batch_size, time, num_channels, height, width = frames.shape + tubelet_size, patch_size = self.config.tubelet_size, self.config.patch_size + if self.config.norm_pix_loss: + # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size) + frames = frames.view( + batch_size, + time // tubelet_size, + tubelet_size, + num_channels, + height // patch_size, + patch_size, + width // patch_size, + patch_size, + ) + # step 2: move dimensions to concatenate: (batch_size, T//ts, H//ps, W//ps, ts, ps, ps, C) + frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous() + # step 3: concatenate: (batch_size, T//ts, H//bs, W//bs, ts*bs*bs, C) + frames = frames.view( + batch_size, + time // tubelet_size * height // patch_size * width // patch_size, + tubelet_size * patch_size * patch_size, + num_channels, + ) + # step 4: normalize. The authors find that the mean is about 0.48 and standard deviation is about 0.08. + frames_norm = (frames - frames.mean(dim=-2, keepdim=True)) / ( + frames.var(dim=-2, unbiased=True, keepdim=True).sqrt() + 1e-6 + ) + # step 5: reshape to (batch_size, T//ts * H//ps * W//ps, ts * ps * ps * C) + videos_patch = frames_norm.view( + batch_size, + time // tubelet_size * height // patch_size * width // patch_size, + tubelet_size * patch_size * patch_size * num_channels, + ) + else: + # step 1: split up dimensions (time by tubelet_size, height by patch_size, width by patch_size) + frames = frames.view( + batch_size, + time // tubelet_size, + tubelet_size, + num_channels, + height // patch_size, + patch_size, + width // patch_size, + patch_size, + ) + # step 2: move dimensions to concatenate: (batch_size, T//ts, H//ps, W//ps, ts, ps, ps, C) + frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous() + # step 3: concatenate + videos_patch = frames.view( + batch_size, + time // tubelet_size * height // patch_size * width // patch_size, + tubelet_size * patch_size * patch_size * num_channels, + ) + + batch_size, _, num_channels = videos_patch.shape + labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels) + loss_fct = MSELoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] @@ -832,14 +904,14 @@ def forward( Examples: ```python - >>> from transformers import AutoFeatureExtractor, VideoMAEForVideoClassification + >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification >>> from PIL import Image >>> import requests >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("nanjing/videomae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") >>> model = VideoMAEForVideoClassification.from_pretrained("nanjing/videomae-base") >>> inputs = feature_extractor(images=image, return_tensors="pt") diff --git a/src/transformers/models/videomae/test_model.py b/src/transformers/models/videomae/test_model.py index cc17e7805474..e47a8a760fa8 100644 --- a/src/transformers/models/videomae/test_model.py +++ 
b/src/transformers/models/videomae/test_model.py @@ -38,12 +38,13 @@ def __call__(self): # test model -model = VideoMAEForPreTraining(VideoMAEConfig()) +model = VideoMAEForPreTraining(VideoMAEConfig(norm_pix_loss=True)) -pixel_values = torch.randn(1, 3, 16, 224, 224) +pixel_values = torch.randn(1, 16, 3, 224, 224) bool_masked_pos = masked_position_generator() print("Shape of bool masked pos:", bool_masked_pos.shape) +print("Number of masked frames:", np.sum(bool_masked_pos)) bool_masked_pos = torch.from_numpy(bool_masked_pos) bool_masked_pos = bool_masked_pos.unsqueeze(0) From ecdfe40f095f2eb3cd355a5403cca8cca596ed02 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 24 May 2022 12:15:11 +0200 Subject: [PATCH 13/42] Improve tests --- .../models/videomae/modeling_videomae.py | 22 +++- .../models/videomae/test_modeling_videomae.py | 106 ++++++------------ 2 files changed, 53 insertions(+), 75 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index bf2d0c159046..dd4d1454a8d1 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -24,7 +24,7 @@ import numpy as np import torch import torch.utils.checkpoint -from torch import nn +from torch import embedding, nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN @@ -48,7 +48,8 @@ _CHECKPOINT_FOR_DOC = "nanjing/videomae-base" VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "nanjing/videomae-base", + # TODO rename to organization + "nielsr/videomae-base", # See all VideoMAE models at https://huggingface.co/models?filter=videomae ] @@ -146,12 +147,19 @@ def forward(self, pixel_values, bool_masked_pos): # add position embeddings embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach() + print("Shape of embeddings:", embeddings.shape) + # only keep visible patches # ~bool_masked_pos means visible if bool_masked_pos is not None: batch_size, _, num_channels = embeddings.shape - embeddings = embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) + print("Shape of bool_masked_pos:", bool_masked_pos.shape) + embeddings = embeddings[~bool_masked_pos] + print("Shape of filtered embeddings:", embeddings.shape) + embeddings = embeddings.reshape(batch_size, -1, num_channels) + print("Shape of final embeddings:", embeddings.shape) + return embeddings @@ -848,8 +856,12 @@ def forward( batch_size, _, num_channels = videos_patch.shape labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels) - loss_fct = MSELoss() - loss = loss_fct(logits, labels) + + print("Shape of logits:", logits.shape) + print("Shape of labels:", labels.shape) + + loss_fct = MSELoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index a2492cd3ead6..9da9dbf5d6e8 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -15,13 +15,12 @@ """ Testing suite for the PyTorch VideoMAE model. 
""" +import copy import inspect -import tempfile import unittest -import numpy as np - from transformers import VideoMAEConfig +from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available @@ -33,8 +32,13 @@ import torch from torch import nn - from transformers import VideoMAEForPreTraining, VideoMAEForVideoClassification, VideoMAEModel - from transformers.models.vit.modeling_vit import VIT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers import ( + MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING, + VideoMAEForPreTraining, + VideoMAEForVideoClassification, + VideoMAEModel, + ) + from transformers.models.videomae.modeling_videomae import VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST if is_vision_available(): @@ -48,11 +52,11 @@ def __init__( self, parent, batch_size=13, - image_size=30, + image_size=10, num_channels=3, patch_size=2, tubelet_size=2, - num_frames=16, + num_frames=2, is_training=True, use_labels=True, hidden_size=32, @@ -87,13 +91,13 @@ def __init__( self.initializer_range = initializer_range self.scope = scope - # in VideoMAE, the number of tokens equals num_frames/2 * num_patches + # in VideoMAE, the number of tokens equals num_frames/tubelet_size * num_patches num_patches = (image_size // patch_size) ** 2 self.seq_length = (num_frames // tubelet_size) * num_patches def prepare_config_and_inputs(self): pixel_values = floats_tensor( - [self.batch_size, self.num_channels, self.num_frames, self.image_size, self.image_size] + [self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size] ) labels = None @@ -133,9 +137,9 @@ def create_and_check_for_pretraining(self, config, pixel_values, labels): model = VideoMAEForPreTraining(config) model.to(torch_device) model.eval() - result = model(pixel_values) - print(result) - assert False + bool_masked_pos = torch.randint(0, 2, (self.batch_size, self.seq_length), dtype=torch.bool) + result = model(pixel_values, bool_masked_pos) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -164,11 +168,29 @@ def setUp(self): self.model_tester = VideoMAEModelTester(self) self.config_tester = ConfigTester(self, config_class=VideoMAEConfig, has_text_modality=False, hidden_size=37) + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class == VideoMAEForPreTraining: + inputs_dict["bool_masked_pos"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + + if return_labels: + if model_class in [ + *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING), + ]: + inputs_dict["labels"] = torch.zeros( + self.model_tester.batch_size, dtype=torch.long, device=torch_device + ) + + return inputs_dict + def test_config(self): self.config_tester.run_common_tests() + @unittest.skip(reason="VideoMAE does not use inputs_embeds") def test_inputs_embeds(self): - # VideoMAE does not use inputs_embeds pass def test_model_common_attributes(self): @@ -200,65 +222,9 @@ def test_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) - def test_save_load(self): - - config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - # make random mask reproducible - torch.manual_seed(2) - with torch.no_grad(): - outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - out_2 = outputs[0].cpu().numpy() - out_2[np.isnan(out_2)] = 0 - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname) - model.to(torch_device) - # make random mask reproducible - torch.manual_seed(2) - with torch.no_grad(): - after_outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - - # Make sure we don't have nans - out_1 = after_outputs[0].cpu().numpy() - out_1[np.isnan(out_1)] = 0 - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - @unittest.skip( - reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""" - ) - def test_determinism(self): - pass - - @unittest.skip( - reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""" - ) - def test_save_load_fast_init_from_base(self): - pass - - @unittest.skip( - reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load - to get deterministic results.""" - ) - def test_save_load_fast_init_to_base(self): - pass - - @unittest.skip(reason="""VideoMAE returns a random mask + ids_restore in each forward pass. See test_save_load""") - def test_model_outputs_equivalence(self): - pass - @slow def test_model_from_pretrained(self): - for model_name in VIT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + for model_name in VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: model = VideoMAEModel.from_pretrained(model_name) self.assertIsNotNone(model) From 971bf85ec413bd19577a787d669d2d1d9704bea0 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 24 May 2022 17:04:30 +0200 Subject: [PATCH 14/42] =?UTF-8?q?Improve=20model=20tests=C3=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/videomae/modeling_videomae.py | 10 +-- .../models/videomae/test_modeling_videomae.py | 81 ++++++++++++++++--- 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index dd4d1454a8d1..bd7041cb0953 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -24,7 +24,7 @@ import numpy as np import torch import torch.utils.checkpoint -from torch import embedding, nn +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN @@ -153,13 +153,11 @@ def forward(self, pixel_values, bool_masked_pos): # ~bool_masked_pos means visible if bool_masked_pos is not None: batch_size, _, num_channels = embeddings.shape - print("Shape of bool_masked_pos:", bool_masked_pos.shape) embeddings = embeddings[~bool_masked_pos] - print("Shape of filtered embeddings:", embeddings.shape) embeddings = embeddings.reshape(batch_size, -1, num_channels) print("Shape of final embeddings:", embeddings.shape) - + return embeddings @@ -758,8 +756,6 @@ def forward( >>> outputs = model(**inputs) >>> loss = outputs.loss - >>> mask = outputs.mask - >>> ids_restore = outputs.ids_restore ```""" 
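The pretraining docstring above leaves `bool_masked_pos = ...` elided, and the test changes later in this series repeatedly stress that every video in a batch has to mask the same number of patches, because the visible embeddings are reshaped to `(batch_size, -1, num_channels)`. A minimal sketch of one way to build such a mask, assuming the default setup used in this series (16 frames, tubelet size 2, 224x224 inputs, 16x16 patches) and a 0.9 masking ratio; the mask generator in the original VideoMAE repository may differ in detail:

```python
import torch

# Illustration only (not code from this patch): one way to build `bool_masked_pos`.
num_frames, tubelet_size = 16, 2
image_size, patch_size = 224, 16
mask_ratio = 0.9

num_patches_per_frame = (image_size // patch_size) ** 2                 # 14 * 14 = 196
seq_length = (num_frames // tubelet_size) * num_patches_per_frame       # 8 * 196 = 1568
num_masks = int(mask_ratio * seq_length)                                # 1411

# every video in a batch must mask the *same* number of patches, otherwise the visible
# embeddings cannot be reshaped to (batch_size, -1, num_channels)
mask = torch.cat([torch.ones(num_masks), torch.zeros(seq_length - num_masks)])
bool_masked_pos = mask[torch.randperm(seq_length)].bool().unsqueeze(0)  # (1, seq_length)
```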
return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -859,7 +855,7 @@ def forward( print("Shape of logits:", logits.shape) print("Shape of labels:", labels.shape) - + loss_fct = MSELoss() loss = loss_fct(logits, labels) diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 9da9dbf5d6e8..386ba1be3b0d 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -68,7 +68,7 @@ def __init__( attention_probs_dropout_prob=0.1, type_sequence_label_size=10, initializer_range=0.02, - num_labels=3, + mask_ratio=0.9, scope=None, ): self.parent = parent @@ -89,11 +89,12 @@ def __init__( self.attention_probs_dropout_prob = attention_probs_dropout_prob self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range + self.mask_ratio = mask_ratio self.scope = scope - # in VideoMAE, the number of tokens equals num_frames/tubelet_size * num_patches - num_patches = (image_size // patch_size) ** 2 - self.seq_length = (num_frames // tubelet_size) * num_patches + # in VideoMAE, the number of tokens equals num_frames/tubelet_size * num_patches per frame + self.num_patches_per_frame = (image_size // patch_size) ** 2 + self.seq_length = (num_frames // tubelet_size) * self.num_patches_per_frame def prepare_config_and_inputs(self): pixel_values = floats_tensor( @@ -137,9 +138,17 @@ def create_and_check_for_pretraining(self, config, pixel_values, labels): model = VideoMAEForPreTraining(config) model.to(torch_device) model.eval() - bool_masked_pos = torch.randint(0, 2, (self.batch_size, self.seq_length), dtype=torch.bool) + # important: each video needs to have the same number of masked patches + # hence we define a single mask, which we then repeat for each example in the batch + mask = torch.randint(0, 2, (self.seq_length,), dtype=torch.bool) + # num_masks_per_frame = int(self.mask_ratio * self.num_patches_per_frame) + # mask = torch.cat([torch.zeros(self.seq_length - self.num_masks_per_frame), torch.ones(self.mask_ratio * self.seq_length)], dtype=torch.bool) + bool_masked_pos = mask.expand(self.batch_size, -1) result = model(pixel_values, bool_masked_pos) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + # model only returns predictions for masked patches + num_masked_patches = mask.sum().item() + decoder_num_labels = 3 * self.tubelet_size * self.patch_size**2 + self.parent.assertEqual(result.logits.shape, (self.batch_size, num_masked_patches, decoder_num_labels)) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -172,10 +181,10 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) if model_class == VideoMAEForPreTraining: - inputs_dict["bool_masked_pos"] = torch.zeros( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + inputs_dict["bool_masked_pos"] = torch.ones( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.bool, device=torch_device ) - + if return_labels: if model_class in [ *get_values(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING), @@ -228,6 +237,60 @@ def test_model_from_pretrained(self): model = VideoMAEModel.from_pretrained(model_name) self.assertIsNotNone(model) + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, 
model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + if hasattr(self.model_tester, "encoder_seq_length"): + seq_length = self.model_tester.encoder_seq_length + if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: + seq_length = seq_length * self.model_tester.chunk_length + else: + seq_length = self.model_tester.seq_length + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_length, self.model_tester.hidden_size], + ) + + if config.is_encoder_decoder: + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + print("Model class:", model_class) + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + # We will verify our results on an image of cute cats def prepare_img(): From 29160e9c0b2a1c7c08368360c304c180b213af67 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 24 May 2022 17:31:59 +0200 Subject: [PATCH 15/42] Make all tests pass --- .../models/videomae/modeling_videomae.py | 12 +- .../models/videomae/test_modeling_videomae.py | 110 +++++++++++++----- 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index bd7041cb0953..20ae913f093f 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -147,8 +147,6 @@ def forward(self, pixel_values, bool_masked_pos): # add position embeddings embeddings = embeddings + self.position_embeddings.type_as(embeddings).to(embeddings.device).clone().detach() - print("Shape of embeddings:", embeddings.shape) - # only keep visible patches # ~bool_masked_pos means visible if bool_masked_pos is not None: @@ -156,8 +154,6 @@ def forward(self, pixel_values, bool_masked_pos): embeddings = embeddings[~bool_masked_pos] embeddings = embeddings.reshape(batch_size, -1, num_channels) - print("Shape of final embeddings:", embeddings.shape) - return embeddings @@ -478,10 +474,9 @@ class VideoMAEPreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True - # Copied from transformers.models.vit.modeling_vit.ViTPreTrainedModel._init_weights def _init_weights(self, module): """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): + if isinstance(module, (nn.Linear, nn.Conv3d)): # Slightly different 
from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) @@ -853,14 +848,11 @@ def forward( batch_size, _, num_channels = videos_patch.shape labels = videos_patch[bool_masked_pos].reshape(batch_size, -1, num_channels) - print("Shape of logits:", logits.shape) - print("Shape of labels:", labels.shape) - loss_fct = MSELoss() loss = loss_fct(logits, labels) if not return_dict: - output = (logits,) + outputs[2:] + output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return VideoMAEForPreTrainingOutput( diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 386ba1be3b0d..6df20ddd72a0 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -96,6 +96,9 @@ def __init__( self.num_patches_per_frame = (image_size // patch_size) ** 2 self.seq_length = (num_frames // tubelet_size) * self.num_patches_per_frame + # use this variable to define bool_masked_pos + self.num_masks = int(mask_ratio * self.seq_length) + def prepare_config_and_inputs(self): pixel_values = floats_tensor( [self.batch_size, self.num_frames, self.num_channels, self.image_size, self.image_size] @@ -140,10 +143,10 @@ def create_and_check_for_pretraining(self, config, pixel_values, labels): model.eval() # important: each video needs to have the same number of masked patches # hence we define a single mask, which we then repeat for each example in the batch - mask = torch.randint(0, 2, (self.seq_length,), dtype=torch.bool) - # num_masks_per_frame = int(self.mask_ratio * self.num_patches_per_frame) - # mask = torch.cat([torch.zeros(self.seq_length - self.num_masks_per_frame), torch.ones(self.mask_ratio * self.seq_length)], dtype=torch.bool) - bool_masked_pos = mask.expand(self.batch_size, -1) + mask = torch.ones((self.num_masks,)) + mask = torch.cat([mask, torch.zeros(self.seq_length - mask.size(0))]) + bool_masked_pos = mask.expand(self.batch_size, -1).bool() + result = model(pixel_values, bool_masked_pos) # model only returns predictions for masked patches num_masked_patches = mask.sum().item() @@ -181,9 +184,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) if model_class == VideoMAEForPreTraining: - inputs_dict["bool_masked_pos"] = torch.ones( - (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.bool, device=torch_device - ) + # important: each video needs to have the same number of masked patches + # hence we define a single mask, which we then repeat for each example in the batch + mask = torch.ones((self.model_tester.num_masks,)) + mask = torch.cat([mask, torch.zeros(self.model_tester.seq_length - mask.size(0))]) + bool_masked_pos = mask.expand(self.model_tester.batch_size, -1).bool() + inputs_dict["bool_masked_pos"] = bool_masked_pos.to(torch_device) if return_labels: if model_class in [ @@ -237,6 +243,67 @@ def test_model_from_pretrained(self): model = VideoMAEModel.from_pretrained(model_name) self.assertIsNotNone(model) + def test_attention_outputs(self): + if not self.has_attentions: + pass + + else: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + num_visible_patches = self.model_tester.seq_length - self.model_tester.num_masks + seq_len 
= ( + num_visible_patches if model_class == VideoMAEForPreTraining else self.model_tester.seq_length + ) + + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, seq_len, seq_len], + ) + def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -246,42 +313,21 @@ def check_hidden_states_output(inputs_dict, config, model_class): with torch.no_grad(): outputs = model(**self._prepare_for_class(inputs_dict, model_class)) - hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states - - expected_num_layers = getattr( - self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 - ) + hidden_states = outputs.hidden_states + expected_num_layers = self.model_tester.num_hidden_layers + 1 self.assertEqual(len(hidden_states), expected_num_layers) - if hasattr(self.model_tester, "encoder_seq_length"): - seq_length = self.model_tester.encoder_seq_length - if hasattr(self.model_tester, "chunk_length") and self.model_tester.chunk_length > 1: - seq_length = seq_length * self.model_tester.chunk_length - else: - seq_length = self.model_tester.seq_length + num_visible_patches = self.model_tester.seq_length - self.model_tester.num_masks + seq_length = num_visible_patches if model_class == VideoMAEForPreTraining else self.model_tester.seq_length self.assertListEqual( list(hidden_states[0].shape[-2:]), [seq_length, self.model_tester.hidden_size], ) - if config.is_encoder_decoder: - hidden_states = outputs.decoder_hidden_states - - self.assertIsInstance(hidden_states, (list, tuple)) - self.assertEqual(len(hidden_states), expected_num_layers) - seq_len = getattr(self.model_tester, "seq_length", None) - decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) - - self.assertListEqual( - list(hidden_states[0].shape[-2:]), - [decoder_seq_length, self.model_tester.hidden_size], - ) - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - 
print("Model class:", model_class) inputs_dict["output_hidden_states"] = True check_hidden_states_output(inputs_dict, config, model_class) From a1fed7f2f60a05152416c61ebc522c7a1dfea5aa Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 3 Jun 2022 13:29:22 +0200 Subject: [PATCH 16/42] Add VideoMAE to main README --- README.md | 1 + README_ko.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/index.mdx | 1 + 5 files changed, 5 insertions(+) diff --git a/README.md b/README.md index 99c90a3916dd..0cda209bdfc3 100644 --- a/README.md +++ b/README.md @@ -368,6 +368,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. diff --git a/README_ko.md b/README_ko.md index adfaefddf628..21471ae03f60 100644 --- a/README_ko.md +++ b/README_ko.md @@ -324,6 +324,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from ) released with the paper []() by . 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0e51441b407b..f945a82a467e 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -348,6 +348,7 @@ conda install -c huggingface transformers 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。 +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from ) released with the paper []() by . 1. 
**[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 1fbff9fa1741..7273fd53171d 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -360,6 +360,7 @@ conda install -c huggingface transformers 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from ) released with the paper []() by . 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 
diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index d3d5f8d411b5..5c0d51d8b7af 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -166,6 +166,7 @@ The library currently contains JAX, PyTorch and TensorFlow implementations, pret 1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[VAN](model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. +1. **[VideoMAE](model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 
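The test overrides added in patch 15 above all hinge on the same piece of bookkeeping: when `bool_masked_pos` is supplied, the encoder of `VideoMAEForPreTraining` only processes the visible patches, so the expected hidden-state and attention sequence length shrinks accordingly, while the decoder's logits cover only the masked patches. A short sketch of the arithmetic, evaluated with the tester's default values (illustration only, not code from the patch):

```python
# Tester defaults from test_modeling_videomae.py: batch_size=13, image_size=10,
# patch_size=2, num_frames=2, tubelet_size=2, num_channels=3, mask_ratio=0.9.
image_size, patch_size = 10, 2
num_frames, tubelet_size = 2, 2
num_channels = 3
mask_ratio = 0.9
batch_size = 13

num_patches_per_frame = (image_size // patch_size) ** 2            # 25
seq_length = (num_frames // tubelet_size) * num_patches_per_frame  # 25
num_masks = int(mask_ratio * seq_length)                           # 22
num_visible_patches = seq_length - num_masks                       # 3 -> encoder seq len for pretraining

# the decoder predicts one vector of tubelet_size * patch_size**2 * num_channels
# pixel values per masked patch
decoder_num_labels = num_channels * tubelet_size * patch_size**2   # 24
print((batch_size, num_masks, decoder_num_labels))                 # expected logits shape: (13, 22, 24)
```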
From bda88401509708dbb33e4dc423b40623f1354f2a Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 6 Jun 2022 14:32:57 +0200 Subject: [PATCH 17/42] Add tests for VideoMAEFeatureExtractor --- .../videomae/feature_extraction_videomae.py | 19 +- .../test_feature_extraction_videomae.py | 204 ++++++++++++++++++ tests/test_feature_extraction_common.py | 98 ++++++--- 3 files changed, 288 insertions(+), 33 deletions(-) create mode 100644 tests/models/videomae/test_feature_extraction_videomae.py diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index e6a47a44720f..3e808aeb52ba 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -20,7 +20,13 @@ from PIL import Image from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, ImageFeatureExtractionMixin, ImageInput +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ImageFeatureExtractionMixin, + ImageInput, + is_torch_tensor, +) from ...utils import TensorType, logging @@ -124,16 +130,19 @@ def __call__( # Check that videos have a valid type if isinstance(videos, (list, tuple)): - if isinstance(videos[0], (Image.Image, np.ndarray)): + if isinstance(videos[0], (Image.Image, np.ndarray)) or is_torch_tensor(videos[0]): valid_videos = True - elif isinstance(videos[0], (list, tuple)) and isinstance(videos[0][0], (Image.Image, np.ndarray)): + elif isinstance(videos[0], (list, tuple)) and ( + isinstance(videos[0][0], (Image.Image, np.ndarray)) or is_torch_tensor(videos[0][0]) + ): valid_videos = True is_batched = True if not valid_videos: raise ValueError( - "Videos must of type `List[PIL.Image.Image]`, `List[np.ndarray]` (single example), " - "`List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]` (batch of examples)." + "Videos must of type `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]` (single" + " example), `List[List[PIL.Image.Image]]`, `List[List[np.ndarray]]`, `List[List[torch.Tensor]]` (batch" + " of examples)." ) if not is_batched: diff --git a/tests/models/videomae/test_feature_extraction_videomae.py b/tests/models/videomae/test_feature_extraction_videomae.py new file mode 100644 index 000000000000..971b4da729ea --- /dev/null +++ b/tests/models/videomae/test_feature_extraction_videomae.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
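Patch 17 above widens the accepted inputs of `VideoMAEFeatureExtractor` to PIL images, NumPy arrays and PyTorch tensors: a single video is a list of frames, a batch is a list of such lists. A minimal usage sketch with default feature-extractor settings; the random 360x640x3 uint8 frames below merely stand in for decoded video frames such as the eating_spaghetti clip used later in this series, and the output shapes match the expectations checked in the tests that follow:

```python
import numpy as np

from transformers import VideoMAEFeatureExtractor

# Illustration only: defaults are assumed to resize/normalize to 224x224, as relied on
# by the integration test and conversion script added later in this series.
feature_extractor = VideoMAEFeatureExtractor()

video = [np.random.randint(0, 256, (360, 640, 3), dtype=np.uint8) for _ in range(16)]

# single video -> pixel_values of shape (1, num_frames, num_channels, height, width)
pixel_values = feature_extractor(video, return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 16, 3, 224, 224])

# batch of videos -> (batch_size, num_frames, num_channels, height, width)
batch = feature_extractor([video, video], return_tensors="pt").pixel_values
print(batch.shape)  # torch.Size([2, 16, 3, 224, 224])
```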
+ + +import unittest + +import numpy as np + +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_feature_extraction_common import FeatureExtractionSavingTestMixin, prepare_video_inputs + + +if is_torch_available(): + import torch + +if is_vision_available(): + from PIL import Image + + from transformers import VideoMAEFeatureExtractor + + +class VideoMAEFeatureExtractionTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + num_frames=10, + image_size=18, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=18, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + ): + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.num_frames = num_frames + self.image_size = image_size + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + + def prepare_feat_extract_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_normalize": self.do_normalize, + "do_resize": self.do_resize, + "size": self.size, + } + + +@require_torch +@require_vision +class VideoMAEFeatureExtractionTest(FeatureExtractionSavingTestMixin, unittest.TestCase): + + feature_extraction_class = VideoMAEFeatureExtractor if is_vision_available() else None + + def setUp(self): + self.feature_extract_tester = VideoMAEFeatureExtractionTester(self) + + @property + def feat_extract_dict(self): + return self.feature_extract_tester.prepare_feat_extract_dict() + + def test_feat_extract_properties(self): + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + self.assertTrue(hasattr(feature_extractor, "image_mean")) + self.assertTrue(hasattr(feature_extractor, "image_std")) + self.assertTrue(hasattr(feature_extractor, "do_normalize")) + self.assertTrue(hasattr(feature_extractor, "do_resize")) + self.assertTrue(hasattr(feature_extractor, "size")) + + def test_batch_feature(self): + pass + + def test_call_pil(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PIL videos + video_inputs = prepare_video_inputs(self.feature_extract_tester, equal_resolution=False) + for video in video_inputs: + self.assertIsInstance(video, list) + self.assertIsInstance(video[0], Image.Image) + + # Test not batched input + encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_videos.shape, + ( + 1, + self.feature_extract_tester.num_frames, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_videos = feature_extractor(video_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_videos.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_frames, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_numpy(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random numpy tensors + video_inputs = prepare_video_inputs(self.feature_extract_tester, 
equal_resolution=False, numpify=True) + for video in video_inputs: + self.assertIsInstance(video, list) + self.assertIsInstance(video[0], np.ndarray) + + # Test not batched input + encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_videos.shape, + ( + 1, + self.feature_extract_tester.num_frames, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_videos = feature_extractor(video_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_videos.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_frames, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + def test_call_pytorch(self): + # Initialize feature_extractor + feature_extractor = self.feature_extraction_class(**self.feat_extract_dict) + # create random PyTorch tensors + video_inputs = prepare_video_inputs(self.feature_extract_tester, equal_resolution=False, torchify=True) + for video in video_inputs: + self.assertIsInstance(video, list) + self.assertIsInstance(video[0], torch.Tensor) + + # Test not batched input + for video in video_inputs: + print(video[0].shape) + encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values + self.assertEqual( + encoded_videos.shape, + ( + 1, + self.feature_extract_tester.num_frames, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) + + # Test batched + encoded_videos = feature_extractor(video_inputs, return_tensors="pt").pixel_values + self.assertEqual( + encoded_videos.shape, + ( + self.feature_extract_tester.batch_size, + self.feature_extract_tester.num_frames, + self.feature_extract_tester.num_channels, + self.feature_extract_tester.size, + self.feature_extract_tester.size, + ), + ) diff --git a/tests/test_feature_extraction_common.py b/tests/test_feature_extraction_common.py index 16ab3c645954..a822b75cc5eb 100644 --- a/tests/test_feature_extraction_common.py +++ b/tests/test_feature_extraction_common.py @@ -48,49 +48,91 @@ def prepare_image_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False): """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, or a list of PyTorch tensors if one specifies torchify=True. + + One can specify whether the images are of the same resolution or not. 
""" assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" - if equal_resolution: - image_inputs = [] - for i in range(feature_extract_tester.batch_size): - image_inputs.append( - np.random.randint( - 255, - size=( - feature_extract_tester.num_channels, - feature_extract_tester.max_resolution, - feature_extract_tester.max_resolution, - ), - dtype=np.uint8, - ) - ) - else: - image_inputs = [] - - # To avoid getting image width/height 0 - min_resolution = feature_extract_tester.min_resolution - if getattr(feature_extract_tester, "size_divisor", None): - # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` - min_resolution = max(feature_extract_tester.size_divisor, min_resolution) - - for i in range(feature_extract_tester.batch_size): + image_inputs = [] + for i in range(feature_extract_tester.batch_size): + if equal_resolution: + width = height = feature_extract_tester.max_resolution + else: + # To avoid getting image width/height 0 + min_resolution = feature_extract_tester.min_resolution + if getattr(feature_extract_tester, "size_divisor", None): + # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` + min_resolution = max(feature_extract_tester.size_divisor, min_resolution) width, height = np.random.choice(np.arange(min_resolution, feature_extract_tester.max_resolution), 2) - image_inputs.append( - np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8) + image_inputs.append( + np.random.randint( + 255, + size=( + feature_extract_tester.num_channels, + width, + height, + ), + dtype=np.uint8, ) + ) if not numpify and not torchify: # PIL expects the channel dimension as last dimension - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + image_inputs = [Image.fromarray(np.moveaxis(image, 0, -1)) for image in image_inputs] if torchify: - image_inputs = [torch.from_numpy(x) for x in image_inputs] + image_inputs = [torch.from_numpy(image) for image in image_inputs] return image_inputs +def prepare_video(feature_extract_tester, width=10, height=10, numpify=False, torchify=False): + """This function prepares a video as a list of PIL images/NumPy arrays/PyTorch tensors.""" + + video = [] + for i in range(feature_extract_tester.num_frames): + video.append(np.random.randint(255, size=(feature_extract_tester.num_channels, width, height), dtype=np.uint8)) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + video = [Image.fromarray(np.moveaxis(frame, 0, -1)) for frame in video] + + if torchify: + video = [torch.from_numpy(frame) for frame in video] + + return video + + +def prepare_video_inputs(feature_extract_tester, equal_resolution=False, numpify=False, torchify=False): + """This function prepares a batch of videos: a list of list of PIL images, or a list of list of numpy arrays if + one specifies numpify=True, or a list of list of PyTorch tensors if one specifies torchify=True. + + One can specify whether the videos are of the same resolution or not. 
+ """ + + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + video_inputs = [] + for i in range(feature_extract_tester.batch_size): + if equal_resolution: + width = height = feature_extract_tester.max_resolution + else: + width, height = np.random.choice( + np.arange(feature_extract_tester.min_resolution, feature_extract_tester.max_resolution), 2 + ) + video = prepare_video( + feature_extract_tester=feature_extract_tester, + width=width, + height=height, + numpify=numpify, + torchify=torchify, + ) + video_inputs.append(video) + + return video_inputs + + class FeatureExtractionSavingTestMixin: def test_feat_extract_to_json_string(self): feat_extract = self.feature_extraction_class(**self.feat_extract_dict) From 63b6e7c8db3afa760ba7ad8626fa88250a3f0ccc Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 20 Jun 2022 11:44:35 +0200 Subject: [PATCH 18/42] Add integration test --- .../models/auto/feature_extraction_auto.py | 2 +- .../models/videomae/test_modeling_videomae.py | 39 ++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py index f77633e56c60..ed526369df4f 100644 --- a/src/transformers/models/auto/feature_extraction_auto.py +++ b/src/transformers/models/auto/feature_extraction_auto.py @@ -68,8 +68,8 @@ ("swin", "ViTFeatureExtractor"), ("swinv2", "ViTFeatureExtractor"), ("van", "ConvNextFeatureExtractor"), - ("vilt", "ViltFeatureExtractor"), ("videomae", "ViTFeatureExtractor"), + ("vilt", "ViltFeatureExtractor"), ("vit", "ViTFeatureExtractor"), ("vit_mae", "ViTFeatureExtractor"), ("wav2vec2", "Wav2Vec2FeatureExtractor"), diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 6df20ddd72a0..bdeedcb8e0ed 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -19,6 +19,9 @@ import inspect import unittest +import numpy as np + +from huggingface_hub import hf_hub_download from transformers import VideoMAEConfig from transformers.models.auto import get_values from transformers.testing_utils import require_torch, require_vision, slow, torch_device @@ -42,8 +45,6 @@ if is_vision_available(): - from PIL import Image - from transformers import VideoMAEFeatureExtractor @@ -338,10 +339,12 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) -# We will verify our results on an image of cute cats -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image +# We will verify our results on a video of eating spaghetti +# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227] +def prepare_video(): + file = hf_hub_download(repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy") + video = np.load(file) + return list(video) @require_torch @@ -349,8 +352,26 @@ def prepare_img(): class VideoMAEModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") if is_vision_available() else None + # TODO update to appropriate organization + return VideoMAEFeatureExtractor() if is_vision_available() else None @slow - def test_inference_for_pretraining(self): - raise NotImplementedError("To do") + def 
test_inference_for_video_classification(self): + # TODO update to appropriate organization + model = VideoMAEForVideoClassification.from_pretrained("nielsr/videomae-base").to(torch_device) + + feature_extractor = self.default_feature_extractor + video = prepare_video() + inputs = feature_extractor(video, return_tensors="pt").to(torch_device) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size((1, 400)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]).to(torch_device) + + self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) From 2f4de0e9c4049c984627d96b67d9e7e1b697d547 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 22 Jun 2022 14:13:08 +0200 Subject: [PATCH 19/42] Improve conversion script --- .../videomae/convert_videomae_to_pytorch.py | 93 ++++++++++++------- 1 file changed, 60 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index b5357847a7ab..42de912ec08c 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -17,13 +17,14 @@ import argparse import json +import numpy as np import torch from huggingface_hub import hf_hub_download -from transformers import VideoMAEConfig, VideoMAEForVideoClassification +from transformers import VideoMAEConfig, VideoMAEFeatureExtractor, VideoMAEForVideoClassification -def get_videomae_config(checkpoint_path): +def get_videomae_config(checkpoint_path, model_name): config = VideoMAEConfig() if "large" in checkpoint_path: @@ -32,9 +33,15 @@ def get_videomae_config(checkpoint_path): config.num_hidden_layers = 24 config.num_attention_heads = 16 - config.num_labels = 400 repo_id = "datasets/huggingface/label-files" - filename = "kinetics400-id2label.json" + if "kinetics" in model_name: + config.num_labels = 400 + filename = "kinetics400-id2label.json" + elif "ssv2" in model_name: + config.num_labels = 174 + filename = "something-something-v2-id2label.json" + else: + raise ValueError("Model name should either contain 'kinetics' or 'ssv2'.") id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) id2label = {int(k): v for k, v in id2label.items()} config.id2label = id2label @@ -127,8 +134,16 @@ def convert_state_dict(orig_state_dict, config): return orig_state_dict -def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_to_hub): - config = get_videomae_config(checkpoint_path) +# We will verify our results on a video of eating spaghetti +# Frame indices used: [164 168 172 176 181 185 189 193 198 202 206 210 215 219 223 227] +def prepare_video(): + file = hf_hub_download(repo_id="datasets/hf-internal-testing/spaghetti-video", filename="eating_spaghetti.npy") + video = np.load(file) + return list(video) + + +def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model_name, push_to_hub): + config = get_videomae_config(checkpoint_path, model_name) model = VideoMAEForVideoClassification(config) @@ -139,39 +154,48 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_ model.eval() # forward pass - pixel_values = torch.load("/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/eating_spaghetti_video.pt") - pixel_values = pixel_values.permute(0, 2, 1, 3, 4) - outputs = model(pixel_values) + 
feature_extractor = VideoMAEFeatureExtractor() + video = prepare_video() + inputs = feature_extractor(video, return_tensors="pt") + outputs = model(**inputs) logits = outputs.logits + print("First values of logits:", logits[0, :3]) + predicted_class_idx = logits.argmax(-1).item() print("Predicted class:", model.config.id2label[predicted_class_idx]) - # if "large" in checkpoint_url: - # expected_slice = torch.tensor( - # [[-0.7309, -0.7128, -1.0169], [-1.0161, -0.9058, -1.1878], [-1.0478, -0.9411, -1.1911]] - # ) - # elif "huge" in checkpoint_url: - # expected_slice = torch.tensor( - # [[-1.1599, -0.9199, -1.2221], [-1.1952, -0.9269, -1.2307], [-1.2143, -0.9337, -1.2262]] - # ) - # else: - # expected_slice = torch.tensor( - # [[-0.9192, -0.8481, -1.1259], [-1.1349, -1.0034, -1.2599], [-1.1757, -1.0429, -1.2726]] - # ) - - # # verify logits - # assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) - - # print(f"Saving model to {pytorch_dump_folder_path}") - # model.save_pretrained(pytorch_dump_folder_path) - - # print(f"Saving feature extractor to {pytorch_dump_folder_path}") - # feature_extractor.save_pretrained(pytorch_dump_folder_path) + model_names = [ + # Kinetics-400 checkpoints + "videomae-base-short", + "videomae-base-short-finetuned-kinetics", + "videomae-base", + "videomae-base-finetuned-kinetics", + "videomae-large", + "videomae-large-finetuned-kinetics", + # Something-Something-v2 checkpoints + "videomae-base-short-ssv2", + "videomae-base-short-finetuned-ssv2", + "videomae-base-ssv2", + "videomae-base-finetuned-ssv2", + ] + if model_name not in model_names: + raise ValueError("Model name not supported.") + + if model_name == "videomae-base-finetuned-kinetics": + expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]) + elif model_name == "videomae-base-finetuned-ssv2": + expected_slice = torch.tensor([-0.1354, -0.4494, -0.4979]) + + # verify logits + assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4) + + print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + model.save_pretrained(pytorch_dump_folder_path) if push_to_hub: print("Pushing to the hub...") - model_name = "videomae-base" model.push_to_hub(model_name, organization="nielsr") @@ -180,16 +204,19 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, push_ # Required parameters parser.add_argument( "--checkpoint_path", - default="/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/checkpoint.pth", + default="/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/Kinetics-400/checkpoint.pth", type=str, help="Path of the original PyTorch checkpoint you'd like to convert.", ) parser.add_argument( "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." ) + parser.add_argument( + "--model_name", default="videomae-base-finetuned-kinetics", type=str, help="Name of the model." + ) parser.add_argument( "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
) args = parser.parse_args() - convert_videomae_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.push_to_hub) + convert_videomae_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub) From 72c46d4131af1c65760b772ca7288fc065e3a6eb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 22 Jun 2022 14:28:51 +0200 Subject: [PATCH 20/42] Rename patch embedding class --- src/transformers/models/videomae/modeling_videomae.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 20ae913f093f..56383387b2e5 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -127,7 +127,7 @@ class VideoMAEEmbeddings(nn.Module): def __init__(self, config): super().__init__() - self.patch_embeddings = PatchEmbeddings( + self.patch_embeddings = VideoMAEPatchEmbeddings( image_size=config.image_size, patch_size=config.patch_size, num_channels=config.num_channels, @@ -157,7 +157,7 @@ def forward(self, pixel_values, bool_masked_pos): return embeddings -class PatchEmbeddings(nn.Module): +class VideoMAEPatchEmbeddings(nn.Module): """ Video to Patch Embedding. This module turns a batch of videos of shape (batch_size, num_frames, num_channels, height, width) into a tensor of shape (batch_size, seq_len, hidden_size) to be consumed by a Transformer encoder. From 7b4d5d148c67313b11f74c58a6d008e9279c99fe Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 22 Jun 2022 14:43:35 +0200 Subject: [PATCH 21/42] Remove VideoMAELayer from init --- src/transformers/__init__.py | 2 -- src/transformers/utils/dummy_pt_objects.py | 7 ------- 2 files changed, 9 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 26c86ae6a81e..de9868d55ffd 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1878,7 +1878,6 @@ [ "VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST", "VideoMAEForPreTraining", - "VideoMAELayer", "VideoMAEModel", "VideoMAEPreTrainedModel", "VideoMAEForVideoClassification", @@ -4358,7 +4357,6 @@ VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST, VideoMAEForPreTraining, VideoMAEForVideoClassification, - VideoMAELayer, VideoMAEModel, VideoMAEPreTrainedModel, ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index a9eed7566e5f..0cfe20c21baf 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4773,13 +4773,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class VideoMAELayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class VideoMAEModel(metaclass=DummyObject): _backends = ["torch"] From f34e2bb8f1a374fd128f9f8fa9b7f3d6e28d9ce7 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 22 Jun 2022 17:30:23 +0200 Subject: [PATCH 22/42] Update design of patch embeddings --- .../models/videomae/modeling_videomae.py | 57 ++++++++++--------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 56383387b2e5..9c2e5aecfb77 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -127,14 +127,7 @@ class 
VideoMAEEmbeddings(nn.Module): def __init__(self, config): super().__init__() - self.patch_embeddings = VideoMAEPatchEmbeddings( - image_size=config.image_size, - patch_size=config.patch_size, - num_channels=config.num_channels, - embed_dim=config.hidden_size, - num_frames=config.num_frames, - tubelet_size=config.tubelet_size, - ) + self.patch_embeddings = VideoMAEPatchEmbeddings(config) self.num_patches = self.patch_embeddings.num_patches # fixed sin-cos embedding self.position_embeddings = get_sinusoid_encoding_table(self.num_patches, config.hidden_size) @@ -167,7 +160,16 @@ class VideoMAEPatchEmbeddings(nn.Module): """ - def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768, num_frames=16, tubelet_size=2): + def __init__(self, config): + image_size, patch_size, num_channels, hidden_size, num_frames, tubelet_size = ( + config.image_size, + config.patch_size, + config.num_channels, + config.hidden_size, + config.num_frames, + config.tubelet_size, + ) + super().__init__() image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) self.image_size = image_size @@ -177,16 +179,21 @@ def __init__(self, image_size=224, patch_size=16, num_channels=3, embed_dim=768, num_patches = ( (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) * (num_frames // self.tubelet_size) ) + self.num_channels = num_channels self.num_patches = num_patches self.projection = nn.Conv3d( in_channels=num_channels, - out_channels=embed_dim, + out_channels=hidden_size, kernel_size=(self.tubelet_size, patch_size[0], patch_size[1]), stride=(self.tubelet_size, patch_size[0], patch_size[1]), ) def forward(self, pixel_values): batch_size, num_frames, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values matches the one set in the configuration." + ) if height != self.image_size[0] or width != self.image_size[1]: raise ValueError( f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
@@ -573,16 +580,14 @@ def forward( ```python >>> from transformers import VideoMAEFeatureExtractor, VideoMAEModel - >>> from PIL import Image - >>> import requests + >>> import torch - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> video = torch.randn(1, 16, 3, 224, 224) >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") >>> model = VideoMAEModel.from_pretrained("nanjing/videomae-base") - >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> inputs = feature_extractor(video, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state ```""" @@ -738,18 +743,17 @@ def forward( Examples: ```python >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForPreTraining - >>> from PIL import Image - >>> import requests + >>> import numpy as np - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> video = np.random.randn(16, 3, 224, 224).tolist() >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/vit-mae-base") >>> model = VideoMAEForPreTraining.from_pretrained("nanjing/vit-mae-base") - >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> pixel_values = feature_extractor(video, return_tensors="pt").pixel_values + >>> bool_masked_pos = ... - >>> outputs = model(**inputs) + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) >>> loss = outputs.loss ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -780,8 +784,7 @@ def forward( decoder_outputs = self.decoder(x_full, pos_emd_mask.shape[1]) # [B, N_mask, 3 * 16 * 16] logits = decoder_outputs.logits - # TODO compute loss - # TODO check correct format of videos! 
(B, T, C, H, W) + # TODO verify loss computation loss = None with torch.no_grad(): # calculate the labels to be predicted @@ -905,16 +908,14 @@ def forward( ```python >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification - >>> from PIL import Image - >>> import requests + >>> import numpy as np - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" - >>> image = Image.open(requests.get(url, stream=True).raw) + >>> video = np.random.randn(16, 3, 224, 224).tolist() >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") >>> model = VideoMAEForVideoClassification.from_pretrained("nanjing/videomae-base") - >>> inputs = feature_extractor(images=image, return_tensors="pt") + >>> inputs = feature_extractor(video, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state ```""" From b052c1c3c05faab9c24e9651f48abc5629dc3f37 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 27 Jun 2022 15:23:54 +0200 Subject: [PATCH 23/42] Improve comments --- .../models/videomae/modeling_videomae.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 9c2e5aecfb77..68ab7a9fcb6d 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -630,7 +630,7 @@ class VideoMAEDecoder(nn.Module): def __init__(self, config, num_patches): super().__init__() - decoder_num_labels = 3 * config.tubelet_size * config.patch_size**2 + decoder_num_labels = config.num_channels * config.tubelet_size * config.patch_size**2 decoder_config = deepcopy(config) decoder_config.hidden_size = config.decoder_hidden_size @@ -768,20 +768,26 @@ def forward( ) sequence_output = outputs[0] - sequence_output = self.encoder_to_decoder(sequence_output) # [B, N_vis, C_d] + sequence_output = self.encoder_to_decoder( + sequence_output + ) # [batch_size, num_visible_patches, decoder_hidden_size] batch_size, seq_len, num_channels = sequence_output.shape - # we don't unshuffle the correct visible token order, - # but shuffle the pos embedding accordingly. - # TODO check for bool_masked_pos to be available + # we don't unshuffle the correct visible token order, but shuffle the position embeddings accordingly. 
+ if bool_masked_pos is None: + raise ValueError("One must provide a boolean mask.") expanded_position_embeddings = self.position_embeddings.expand(batch_size, -1, -1).type_as(pixel_values) expanded_position_embeddings = expanded_position_embeddings.to(pixel_values.device).clone().detach() - pos_emd_vis = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) - pos_emd_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels) + pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) + pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels) - x_full = torch.cat([sequence_output + pos_emd_vis, self.mask_token + pos_emd_mask], dim=1) # [B, N, C_d] + x_full = torch.cat( + [sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1 + ) # [batch_size, num_patches, decoder_hidden_size] - decoder_outputs = self.decoder(x_full, pos_emd_mask.shape[1]) # [B, N_mask, 3 * 16 * 16] + decoder_outputs = self.decoder( + x_full, pos_emb_mask.shape[1] + ) # [batch_size, num_masked_patches, num_channels * patch_size * patch_size] logits = decoder_outputs.logits # TODO verify loss computation loss = None with torch.no_grad(): # calculate the labels to be predicted From 0e69ac3890f903ef96bf5536a87551b27f5839db Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 7 Jul 2022 11:56:20 +0200 Subject: [PATCH 24/42] Improve conversion script --- .../models/videomae/configuration_videomae.py | 4 +- .../videomae/convert_videomae_to_pytorch.py | 84 ++++++++++++------- 2 files changed, 56 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 3ded151570a7..97a30e9003b8 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -74,7 +74,7 @@ class VideoMAEConfig(PretrainedConfig): Number of attention heads for each attention layer in the decoder. decoder_hidden_size (`int`, *optional*, defaults to 384): Dimensionality of the decoder. - decoder_num_hidden_layers (`int`, *optional*, defaults to 12): + decoder_num_hidden_layers (`int`, *optional*, defaults to 4): Number of hidden layers in the decoder. decoder_intermediate_size (`int`, *optional*, defaults to 1536): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder. 
@@ -117,7 +117,7 @@ def __init__( use_mean_pooling=True, decoder_num_attention_heads=6, decoder_hidden_size=384, - decoder_num_hidden_layers=12, + decoder_num_hidden_layers=4, decoder_intermediate_size=1536, norm_pix_loss=True, **kwargs diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 42de912ec08c..a15cc59fc680 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -21,7 +21,12 @@ import torch from huggingface_hub import hf_hub_download -from transformers import VideoMAEConfig, VideoMAEFeatureExtractor, VideoMAEForVideoClassification +from transformers import ( + VideoMAEConfig, + VideoMAEFeatureExtractor, + VideoMAEForPreTraining, + VideoMAEForVideoClassification, +) def get_videomae_config(checkpoint_path, model_name): @@ -34,23 +39,26 @@ def get_videomae_config(checkpoint_path, model_name): config.num_attention_heads = 16 repo_id = "datasets/huggingface/label-files" - if "kinetics" in model_name: - config.num_labels = 400 - filename = "kinetics400-id2label.json" - elif "ssv2" in model_name: - config.num_labels = 174 - filename = "something-something-v2-id2label.json" - else: - raise ValueError("Model name should either contain 'kinetics' or 'ssv2'.") - id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) - id2label = {int(k): v for k, v in id2label.items()} - config.id2label = id2label - config.label2id = {v: k for k, v in id2label.items()} + if "finetuned" in model_name: + if "kinetics" in model_name: + config.num_labels = 400 + filename = "kinetics400-id2label.json" + elif "ssv2" in model_name: + config.num_labels = 174 + filename = "something-something-v2-id2label.json" + else: + raise ValueError("Model name should either contain 'kinetics' or 'ssv2' in case it's fine-tuned.") + id2label = json.load(open(hf_hub_download(repo_id, filename), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} return config def rename_key(name): + if "encoder." in name: + name = name.replace("encoder.", "") if "cls_token" in name: name = name.replace("cls_token", "videomae.embeddings.cls_token") if "mask_token" in name: @@ -63,8 +71,8 @@ def rename_key(name): name = name.replace("patch_embed.proj", "videomae.embeddings.patch_embeddings.projection") if "patch_embed.norm" in name: name = name.replace("patch_embed.norm", "videomae.embeddings.norm") - if "decoder_blocks" in name: - name = name.replace("decoder_blocks", "decoder.decoder_layers") + if "decoder.blocks" in name: + name = name.replace("decoder.blocks", "decoder.decoder_layers") if "blocks" in name: name = name.replace("blocks", "videomae.encoder.layer") if "attn.proj" in name: @@ -100,13 +108,19 @@ def rename_key(name): def convert_state_dict(orig_state_dict, config): for key in orig_state_dict.copy().keys(): val = orig_state_dict.pop(key) - + + if key.startswith("encoder."): + key = key.replace("encoder.", "") + if "qkv" in key: key_split = key.split(".") - layer_num = int(key_split[1]) - if "decoder_blocks" in key: + print("Key:", key) + if key.startswith("decoder.blocks"): dim = config.decoder_hidden_size + layer_num = int(key_split[2]) prefix = "decoder.decoder_layers." 
+ print("Old name:", key) + print("New name:", f"{prefix}{layer_num}.attention.attention.query.weight") if "weight" in key: orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] @@ -117,6 +131,7 @@ def convert_state_dict(orig_state_dict, config): # orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] else: dim = config.hidden_size + layer_num = int(key_split[1]) prefix = "videomae.encoder.layer." if "weight" in key: orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] @@ -145,9 +160,16 @@ def prepare_video(): def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model_name, push_to_hub): config = get_videomae_config(checkpoint_path, model_name) - model = VideoMAEForVideoClassification(config) + if "finetuned" in model_name: + model = VideoMAEForVideoClassification(config) + else: + model = VideoMAEForPreTraining(config) - state_dict = torch.load(checkpoint_path, map_location="cpu")["module"] + state_dict = torch.load(checkpoint_path, map_location="cpu") + if "finetuned" in model_name: + state_dict = state_dict["module"] + else: + state_dict = state_dict["model"] new_state_dict = convert_state_dict(state_dict, config) model.load_state_dict(new_state_dict) @@ -160,20 +182,17 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model outputs = model(**inputs) logits = outputs.logits - print("First values of logits:", logits[0, :3]) - - predicted_class_idx = logits.argmax(-1).item() - print("Predicted class:", model.config.id2label[predicted_class_idx]) + print("Shape of logits:", logits.shape) model_names = [ - # Kinetics-400 checkpoints + # Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600) "videomae-base-short", "videomae-base-short-finetuned-kinetics", "videomae-base", "videomae-base-finetuned-kinetics", "videomae-large", "videomae-large-finetuned-kinetics", - # Something-Something-v2 checkpoints + # Something-Something-v2 checkpoints (short = pretrained only for 800 epochs instead of 2400) "videomae-base-short-ssv2", "videomae-base-short-finetuned-ssv2", "videomae-base-ssv2", @@ -189,10 +208,12 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model # verify logits assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4) + print("Logits ok!") - print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") - feature_extractor.save_pretrained(pytorch_dump_folder_path) - model.save_pretrained(pytorch_dump_folder_path) + if pytorch_dump_folder_path is not None: + print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + model.save_pretrained(pytorch_dump_folder_path) if push_to_hub: print("Pushing to the hub...") @@ -204,7 +225,10 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model # Required parameters parser.add_argument( "--checkpoint_path", - default="/Users/nielsrogge/Documents/VideoMAE/Original checkpoints/Kinetics-400/checkpoint.pth", + default=( + "/Users/nielsrogge/Documents/VideoMAE/Original" + " checkpoints/Kinetics-400/videomae-base-finetuned-kinetics/checkpoint.pth" + ), type=str, help="Path of the original PyTorch checkpoint you'd like to convert.", ) From 1a14bea1e856ddc61d8877696b388b541d6bd767 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 7 
Jul 2022 12:17:45 +0200 Subject: [PATCH 25/42] Improve conversion script --- .../models/videomae/convert_videomae_to_pytorch.py | 11 +++-------- src/transformers/models/videomae/modeling_videomae.py | 10 ++++++---- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index a15cc59fc680..6a28b3ae5dab 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -61,8 +61,6 @@ def rename_key(name): name = name.replace("encoder.", "") if "cls_token" in name: name = name.replace("cls_token", "videomae.embeddings.cls_token") - if "mask_token" in name: - name = name.replace("mask_token", "decoder.mask_token") if "decoder_pos_embed" in name: name = name.replace("decoder_pos_embed", "decoder.decoder_pos_embed") if "pos_embed" in name and "decoder" not in name: @@ -99,7 +97,7 @@ def rename_key(name): name = name.replace("norm.weight", "videomae.layernorm.weight") if "norm.bias" in name and "decoder" not in name and "fc" not in name: name = name.replace("norm.bias", "videomae.layernorm.bias") - if "head" in name: + if "head" in name and "decoder" not in name: name = name.replace("head", "classifier") return name @@ -108,19 +106,16 @@ def rename_key(name): def convert_state_dict(orig_state_dict, config): for key in orig_state_dict.copy().keys(): val = orig_state_dict.pop(key) - + if key.startswith("encoder."): key = key.replace("encoder.", "") - + if "qkv" in key: key_split = key.split(".") - print("Key:", key) if key.startswith("decoder.blocks"): dim = config.decoder_hidden_size layer_num = int(key_split[2]) prefix = "decoder.decoder_layers." 
- print("Old name:", key) - print("New name:", f"{prefix}{layer_num}.attention.attention.query.weight") if "weight" in key: orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 68ab7a9fcb6d..a0dbbafc6474 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -544,6 +544,8 @@ def __init__(self, config): self.embeddings = VideoMAEEmbeddings(config) self.encoder = VideoMAEEncoder(config) + print("Creating layernorm:", config.use_mean_pooling) + self.layernorm = ( nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) ) @@ -641,8 +643,8 @@ def __init__(self, config, num_patches): [VideoMAELayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)] ) - self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size) - self.decoder_pred = ( + self.norm = nn.LayerNorm(config.decoder_hidden_size) + self.head = ( nn.Linear(config.decoder_hidden_size, decoder_num_labels) if decoder_num_labels > 0 else nn.Identity() ) @@ -692,8 +694,8 @@ def custom_forward(*inputs): hidden_states = hidden_states[:, -return_token_num:] # predictor projection - hidden_states = self.decoder_norm(hidden_states) - logits = self.decoder_pred(hidden_states) + hidden_states = self.norm(hidden_states) + logits = self.head(hidden_states) if not return_dict: return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None) From b42d2ffb118717233b381da73ee4a17d940e5eb2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 7 Jul 2022 14:45:22 +0200 Subject: [PATCH 26/42] Add conversion of pretrained model --- .../videomae/convert_videomae_to_pytorch.py | 23 +++++++++++++++---- .../models/videomae/modeling_videomae.py | 14 ++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 6a28b3ae5dab..d58fdca07d7f 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -38,8 +38,11 @@ def get_videomae_config(checkpoint_path, model_name): config.num_hidden_layers = 24 config.num_attention_heads = 16 - repo_id = "datasets/huggingface/label-files" + if "finetuned" not in model_name: + config.use_mean_pooling = False + if "finetuned" in model_name: + repo_id = "datasets/huggingface/label-files" if "kinetics" in model_name: config.num_labels = 400 filename = "kinetics400-id2label.json" @@ -170,10 +173,15 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model model.load_state_dict(new_state_dict) model.eval() - # forward pass + # verify model on basic input feature_extractor = VideoMAEFeatureExtractor() video = prepare_video() inputs = feature_extractor(video, return_tensors="pt") + + if "finetuned" not in model_name: + local_path = hf_hub_download(repo_id="nielsr/bool-masked-pos", filename="bool_masked_pos.pt") + inputs["bool_masked_pos"] = torch.load(local_path) + outputs = model(**inputs) logits = outputs.logits @@ -196,13 +204,20 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model if model_name not in model_names: raise 
ValueError("Model name not supported.") - if model_name == "videomae-base-finetuned-kinetics": + if model_name == "videomae-base-short": + expected_slice = torch.tensor( + [[-0.4798, -0.3191, -0.2558], [-0.3396, -0.2823, -0.1581], [0.4327, 0.4635, 0.4745]] + ) + elif model_name == "videomae-base-finetuned-kinetics": expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]) elif model_name == "videomae-base-finetuned-ssv2": expected_slice = torch.tensor([-0.1354, -0.4494, -0.4979]) # verify logits - assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4) + if "finetuned" in model_name: + assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4) + else: + assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) print("Logits ok!") if pytorch_dump_folder_path is not None: diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index a0dbbafc6474..448a9cb451ce 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -544,11 +544,10 @@ def __init__(self, config): self.embeddings = VideoMAEEmbeddings(config) self.encoder = VideoMAEEncoder(config) - print("Creating layernorm:", config.use_mean_pooling) - - self.layernorm = ( - nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - ) + if config.use_mean_pooling: + self.layernorm = None + else: + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) # Initialize weights and apply final processing self.post_init() @@ -616,7 +615,8 @@ def forward( return_dict=return_dict, ) sequence_output = encoder_outputs[0] - sequence_output = self.layernorm(sequence_output) + if self.layernorm is not None: + sequence_output = self.layernorm(sequence_output) if not return_dict: return (sequence_output,) + encoder_outputs[1:] @@ -792,6 +792,8 @@ def forward( ) # [batch_size, num_masked_patches, num_channels * patch_size * patch_size] logits = decoder_outputs.logits + print("Shape of reconstruction:", logits.shape) + # TODO verify loss computation loss = None with torch.no_grad(): From 099b3f3e9422c7ec3638cc707966c6890c375de2 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 7 Jul 2022 15:13:47 +0200 Subject: [PATCH 27/42] Add loss verification of pretrained model --- .../models/videomae/convert_videomae_to_pytorch.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index d58fdca07d7f..7444c7885959 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -31,7 +31,7 @@ def get_videomae_config(checkpoint_path, model_name): config = VideoMAEConfig() - + if "large" in checkpoint_path: config.hidden_size = 1024 config.intermediate_size = 4096 @@ -205,21 +205,32 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model raise ValueError("Model name not supported.") if model_name == "videomae-base-short": + expected_shape = torch.Size([1, 1408, 1536]) expected_slice = torch.tensor( [[-0.4798, -0.3191, -0.2558], [-0.3396, -0.2823, -0.1581], [0.4327, 0.4635, 0.4745]] ) + expected_loss = torch.tensor([0.5379046201705933]) elif model_name == "videomae-base-finetuned-kinetics": + expected_shape = torch.Size([1, 400]) expected_slice = torch.tensor([0.7666, 
-0.2265, -0.5551]) elif model_name == "videomae-base-finetuned-ssv2": + expected_shape = torch.Size([1, 74]) expected_slice = torch.tensor([-0.1354, -0.4494, -0.4979]) # verify logits + assert logits.shape == expected_shape if "finetuned" in model_name: assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4) else: assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) print("Logits ok!") + # verify loss, if applicable + if "finetuned" not in model_name: + loss = outputs.loss + assert torch.allclose(loss, expected_loss, atol=1e-4) + print("Loss ok!") + if pytorch_dump_folder_path is not None: print(f"Saving model and feature extractor to {pytorch_dump_folder_path}") feature_extractor.save_pretrained(pytorch_dump_folder_path) From ab40b5f6cdc43efdfc55243f9e8d976bef4df183 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 7 Jul 2022 15:19:43 +0200 Subject: [PATCH 28/42] Add loss verification of unnormalized targets --- .../models/videomae/convert_videomae_to_pytorch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 7444c7885959..77ba66f512f1 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -31,7 +31,7 @@ def get_videomae_config(checkpoint_path, model_name): config = VideoMAEConfig() - + if "large" in checkpoint_path: config.hidden_size = 1024 config.intermediate_size = 4096 @@ -209,7 +209,10 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model expected_slice = torch.tensor( [[-0.4798, -0.3191, -0.2558], [-0.3396, -0.2823, -0.1581], [0.4327, 0.4635, 0.4745]] ) - expected_loss = torch.tensor([0.5379046201705933]) + # we verified the loss both for normalized and unnormalized targets for this one + expected_loss = ( + torch.tensor([0.5379046201705933]) if config.norm_pix_loss else torch.tensor([0.593469500541687]) + ) elif model_name == "videomae-base-finetuned-kinetics": expected_shape = torch.Size([1, 400]) expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]) @@ -228,6 +231,7 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model # verify loss, if applicable if "finetuned" not in model_name: loss = outputs.loss + print("Loss:", loss) assert torch.allclose(loss, expected_loss, atol=1e-4) print("Loss ok!") From 3a523d87d3ad71cc5fdf05c3f58643714eee11f5 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 7 Jul 2022 16:06:44 +0200 Subject: [PATCH 29/42] Add integration test for pretraining model --- .../models/videomae/test_modeling_videomae.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index bdeedcb8e0ed..1e866bb44e2f 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -375,3 +375,34 @@ def test_inference_for_video_classification(self): expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_for_pretraining(self): + # TODO update to appropriate organization + model = VideoMAEForPreTraining.from_pretrained("nielsr/videomae-base-short").to(torch_device) + + feature_extractor = self.default_feature_extractor + video = prepare_video() + 
inputs = feature_extractor(video, return_tensors="pt").to(torch_device) + + # add boolean mask, indicating which patches to mask + local_path = hf_hub_download(repo_id="nielsr/bool-masked-pos", filename="bool_masked_pos.pt") + inputs["bool_masked_pos"] = torch.load(local_path) + + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + + # verify the logits + expected_shape = torch.Size([1, 1408, 1536]) + expected_slice = torch.tensor( + [[-0.4798, -0.3191, -0.2558], [-0.3396, -0.2823, -0.1581], [0.4327, 0.4635, 0.4745]] + ) + self.assertEqual(outputs.logits.shape, expected_shape) + self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)) + + # verify the loss + expected_loss = ( + torch.tensor([0.5379046201705933]) if model.config.norm_pix_loss else torch.tensor([0.593469500541687]) + ) + self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4)) From 7e22b9858f94c1ca299507151d64a03eca78d3f9 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 8 Jul 2022 16:37:26 +0200 Subject: [PATCH 30/42] Apply suggestions from code review --- docs/source/en/model_doc/auto.mdx | 4 +++ src/transformers/__init__.py | 2 ++ src/transformers/models/auto/__init__.py | 2 ++ .../models/videomae/modeling_videomae.py | 34 ++++++++----------- src/transformers/utils/dummy_pt_objects.py | 7 ++++ 5 files changed, 29 insertions(+), 20 deletions(-) diff --git a/docs/source/en/model_doc/auto.mdx b/docs/source/en/model_doc/auto.mdx index 6c3216638961..67fc81d280a7 100644 --- a/docs/source/en/model_doc/auto.mdx +++ b/docs/source/en/model_doc/auto.mdx @@ -118,6 +118,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its [[autodoc]] AutoModelForImageClassification +## AutoModelForVideoClassification + +[[autodoc]] AutoModelForVideoClassification + ## AutoModelForVision2Seq [[autodoc]] AutoModelForVision2Seq diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index de9868d55ffd..e8cfd47f3d3b 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -828,6 +828,7 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", + "AutoModelForVideoClassification", "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", "AutoModelWithLMHead", @@ -3538,6 +3539,7 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, AutoModelWithLMHead, diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index 4706d90db0c4..b04c2420ef96 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -90,6 +90,7 @@ "AutoModelForSpeechSeq2Seq", "AutoModelForTableQuestionAnswering", "AutoModelForTokenClassification", + "AutoModelForVideoClassification", "AutoModelForVision2Seq", "AutoModelForVisualQuestionAnswering", "AutoModelWithLMHead", @@ -231,6 +232,7 @@ AutoModelForSpeechSeq2Seq, AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, + AutoModelForVideoClassification, AutoModelForVision2Seq, AutoModelForVisualQuestionAnswering, AutoModelWithLMHead, diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 448a9cb451ce..e8428248ddbf 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ 
-161,16 +161,15 @@ class VideoMAEPatchEmbeddings(nn.Module): """ def __init__(self, config): - image_size, patch_size, num_channels, hidden_size, num_frames, tubelet_size = ( - config.image_size, - config.patch_size, - config.num_channels, - config.hidden_size, - config.num_frames, - config.tubelet_size, - ) - super().__init__() + + image_size = config.image_size + patch_size = config.patch_size + num_channels = config.num_channels + hidden_size = config.hidden_size + num_frames = config.num_frames + tubelet_size = config.tubelet_size + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) self.image_size = image_size @@ -783,18 +782,13 @@ def forward( pos_emb_visible = expanded_position_embeddings[~bool_masked_pos].reshape(batch_size, -1, num_channels) pos_emb_mask = expanded_position_embeddings[bool_masked_pos].reshape(batch_size, -1, num_channels) - x_full = torch.cat( - [sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1 - ) # [batch_size, num_patches, decoder_hidden_size] + # [batch_size, num_patches, decoder_hidden_size] + x_full = torch.cat([sequence_output + pos_emb_visible, self.mask_token + pos_emb_mask], dim=1) - decoder_outputs = self.decoder( - x_full, pos_emb_mask.shape[1] - ) # [batch_size, num_masked_patches, num_channels * patch_size * patch_size] + # [batch_size, num_masked_patches, num_channels * patch_size * patch_size] + decoder_outputs = self.decoder(x_full, pos_emb_mask.shape[1]) logits = decoder_outputs.logits - print("Shape of reconstruction:", logits.shape) - - # TODO verify loss computation loss = None with torch.no_grad(): # calculate the labels to be predicted @@ -818,9 +812,9 @@ def forward( width // patch_size, patch_size, ) - # step 2: move dimensions to concatenate: (batch_size, T//ts, H//ps, W//ps, ts, ps, ps, C) + # step 2: move dimensions to concatenate: frames = frames.permute(0, 1, 4, 6, 2, 5, 7, 3).contiguous() - # step 3: concatenate: (batch_size, T//ts, H//bs, W//bs, ts*bs*bs, C) + # step 3: concatenate: frames = frames.view( batch_size, time // tubelet_size * height // patch_size * width // patch_size, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 0cfe20c21baf..d636be655af2 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -575,6 +575,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForVideoClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForVision2Seq(metaclass=DummyObject): _backends = ["torch"] From 0f0beb8f7b018ccd94d183f57ff2598e58717ca6 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 8 Jul 2022 17:29:24 +0200 Subject: [PATCH 31/42] Fix bug to make feature extractor resize only shorter edge --- .../videomae/convert_videomae_to_pytorch.py | 16 +++++--------- .../videomae/feature_extraction_videomae.py | 19 +++++++++-------- src/transformers/models/videomae/test.py | 10 ++++++--- .../models/videomae/test_modeling_videomae.py | 21 ++++++++++++------- 4 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 77ba66f512f1..0b3dc914f810 100644 --- 
a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -185,8 +185,6 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model outputs = model(**inputs) logits = outputs.logits - print("Shape of logits:", logits.shape) - model_names = [ # Kinetics-400 checkpoints (short = pretrained only for 800 epochs instead of 1600) "videomae-base-short", @@ -206,19 +204,15 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model if model_name == "videomae-base-short": expected_shape = torch.Size([1, 1408, 1536]) - expected_slice = torch.tensor( - [[-0.4798, -0.3191, -0.2558], [-0.3396, -0.2823, -0.1581], [0.4327, 0.4635, 0.4745]] - ) + expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]]) # we verified the loss both for normalized and unnormalized targets for this one - expected_loss = ( - torch.tensor([0.5379046201705933]) if config.norm_pix_loss else torch.tensor([0.593469500541687]) - ) + expected_loss = torch.tensor([0.5142]) if config.norm_pix_loss else torch.tensor([0.6469]) elif model_name == "videomae-base-finetuned-kinetics": expected_shape = torch.Size([1, 400]) - expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]) + expected_slice = torch.tensor([0.3669, -0.0688, -0.2421]) elif model_name == "videomae-base-finetuned-ssv2": - expected_shape = torch.Size([1, 74]) - expected_slice = torch.tensor([-0.1354, -0.4494, -0.4979]) + expected_shape = torch.Size([1, 174]) + expected_slice = torch.tensor([-0.0537, -0.1539, -0.3266]) # verify logits assert logits.shape == expected_shape diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 3e808aeb52ba..86f7ee6ce472 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -42,11 +42,11 @@ class VideoMAEFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to resize the input to a certain `size`. + Whether to resize the shorter edge of the input to a certain `size`. size (`int` or `Tuple(int)`, *optional*, defaults to 224): - Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an - integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize` is - set to `True`. + Resize the shorter edge of the input to the given size. If a tuple is provided, it should be (width, + height). If only an integer is provided, then the input will be resized to (size, size). Only has an effect + if `do_resize` is set to `True`. resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. 
Only has an effect @@ -84,7 +84,7 @@ def __init__( self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD def resize_video(self, video, size, resample="bilinear"): - return [self.resize(frame, size, resample) for frame in video] + return [self.resize(frame, size, resample, default_to_square=False) for frame in video] def crop_video(self, video, size): return [self.center_crop(frame, size) for frame in video] @@ -105,10 +105,11 @@ def __call__( Args: - videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`): - The video or batch of videos to be prepared. Each video should be a list of frames, which can be either - PIL images or NumPy arrays. In case of a NumPy array, each frame should be of shape (H, W, C), where H - and W are frame height and width, and C is a number of channels. + videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,: + `List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list + of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors, + each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of + channels. return_tensors (`str` or [`~utils.TensorType`], *optional*, defaults to `'np'`): If set, will return tensors of a particular framework. Acceptable values are: diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py index e4c0fc91f77e..5247c4f6f009 100644 --- a/src/transformers/models/videomae/test.py +++ b/src/transformers/models/videomae/test.py @@ -1,5 +1,5 @@ -# import torch import numpy as np +import torch from transformers import VideoMAEFeatureExtractor @@ -10,8 +10,12 @@ video = [np.random.rand(512, 640, 3), np.random.rand(312, 200, 3)] -video = np.random.rand(16, 360, 640, 3) -video = [video[i] for i in range(video.shape[0])] +video = [np.random.rand(3, 512, 640), np.random.rand(3, 312, 200)] + +video = [torch.randn(3, 512, 640), torch.rand(3, 312, 200)] + +# video = np.random.rand(16, 360, 640, 3) +# video = [video[i] for i in range(video.shape[0])] encoding = feature_extractor(video, return_tensors="pt") diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 1e866bb44e2f..ddbee5ff6a6b 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -372,7 +372,7 @@ def test_inference_for_video_classification(self): expected_shape = torch.Size((1, 400)) self.assertEqual(outputs.logits.shape, expected_shape) - expected_slice = torch.tensor([0.7666, -0.2265, -0.5551]).to(torch_device) + expected_slice = torch.tensor([0.3669, -0.0688, -0.2421]).to(torch_device) self.assertTrue(torch.allclose(outputs.logits[0, :3], expected_slice, atol=1e-4)) @@ -395,14 +395,21 @@ def test_inference_for_pretraining(self): # verify the logits expected_shape = torch.Size([1, 1408, 1536]) - expected_slice = torch.tensor( - [[-0.4798, -0.3191, -0.2558], [-0.3396, -0.2823, -0.1581], [0.4327, 0.4635, 0.4745]] - ) + expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]]) self.assertEqual(outputs.logits.shape, expected_shape) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)) - # verify the loss - expected_loss = ( - torch.tensor([0.5379046201705933]) if 
model.config.norm_pix_loss else torch.tensor([0.593469500541687]) + # verify the loss (`config.norm_pix_loss` = `True`) + expected_loss = torch.tensor([0.5142]) + self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4)) + + # verify the loss (`config.norm_pix_loss` = `False`) + model = VideoMAEForPreTraining.from_pretrained("nielsr/videomae-base-short", norm_pix_loss=False).to( + torch_device ) + + with torch.no_grad(): + outputs = model(**inputs) + + expected_loss = torch.tensor(torch.tensor([0.6469])) self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4)) From d899dbe9130fc0d539c8b4af6734a79cf64e247f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Fri, 8 Jul 2022 17:43:20 +0200 Subject: [PATCH 32/42] Address more comments --- README_ko.md | 2 +- README_zh-hans.md | 2 +- README_zh-hant.md | 2 +- .../models/videomae/convert_videomae_to_pytorch.py | 10 ---------- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/README_ko.md b/README_ko.md index 21471ae03f60..c63fdca749da 100644 --- a/README_ko.md +++ b/README_ko.md @@ -324,7 +324,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from ) released with the paper []() by . +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. 
**[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. diff --git a/README_zh-hans.md b/README_zh-hans.md index f945a82a467e..0ab06bd96ad9 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -348,7 +348,7 @@ conda install -c huggingface transformers 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。 -1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from ) released with the paper []() by . +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (来自 Multimedia Computing Group, Nanjing University) 伴随论文 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 由 Zhan Tong, Yibing Song, Jue Wang, Limin Wang 发布。 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 7273fd53171d..90f29ad031b8 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -360,7 +360,7 @@ conda install -c huggingface transformers 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang. 1. 
**[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu. 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu. -1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from ) released with the paper []() by . +1. **[VideoMAE](https://huggingface.co/docs/transformers/main/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang. 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim. 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. 
diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 0b3dc914f810..6147ce5c9446 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -123,10 +123,6 @@ def convert_state_dict(orig_state_dict, config): orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] - # elif "bias" in key: - # orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] - # orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] - # orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] else: dim = config.hidden_size layer_num = int(key_split[1]) @@ -135,12 +131,6 @@ def convert_state_dict(orig_state_dict, config): orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.weight"] = val[:dim, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.weight"] = val[dim : dim * 2, :] orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.weight"] = val[-dim:, :] - # elif "bias" in key: - # print("hello we're here") - # orig_state_dict[f"{prefix}{layer_num}.attention.attention.query.bias"] = val[:dim] - # orig_state_dict[f"{prefix}{layer_num}.attention.attention.key.bias"] = val[dim : dim * 2] - # orig_state_dict[f"{prefix}{layer_num}.attention.attention.value.bias"] = val[-dim:] - else: orig_state_dict[rename_key(key)] = val From e1e658de83052fa383622a9b705ee279b736c1bb Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Sat, 9 Jul 2022 14:05:43 +0200 Subject: [PATCH 33/42] Improve normalization of videos --- .../videomae/feature_extraction_videomae.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 86f7ee6ce472..63308458df58 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -90,7 +90,20 @@ def crop_video(self, video, size): return [self.center_crop(frame, size) for frame in video] def normalize_video(self, video, mean, std): - return [self.normalize(frame, mean, std) for frame in video] + # video can be a list of PIL images, list of NumPy arrays or list of PyTorch tensors + # first: convert to list of NumPy arrays + video = [self.to_numpy_array(frame) for frame in video] + + # second: stack to get (num_frames, num_channels, height, width) + video = np.stack(video, axis=0) + + # third: normalize + if not isinstance(mean, np.ndarray): + mean = np.array(mean).astype(video.dtype) + if not isinstance(std, np.ndarray): + std = np.array(std).astype(video.dtype) + + return (video - mean[None, :, None, None]) / std[None, :, None, None] def __call__( self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs From 98ee976e3af7007abffbccad1e1a11b56747827f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 1 Aug 2022 17:06:06 +0200 Subject: [PATCH 34/42] Add doc examples --- .../models/videomae/modeling_videomae.py | 53 ++++++++++++++----- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git 
a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index e8428248ddbf..557bac43b667 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -579,15 +579,39 @@ def forward( Examples: ```python + >>> from decord import VideoReader, cpu + >>> import numpy as np >>> from transformers import VideoMAEFeatureExtractor, VideoMAEModel - >>> import torch - >>> video = torch.randn(1, 16, 3, 224, 224) - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") - >>> model = VideoMAEModel.from_pretrained("nanjing/videomae-base") + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> video_url = "https://huggingface.co/datasets/nielsr/video-demo/resolve/main/eating_spaghetti.mp4" + >>> vr = VideoReader(video_url, num_threads=1, ctx=cpu(0)) + + >>> # sample 16 frames + >>> vr.seek(0) + >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr)) + >>> buffer = vr.get_batch(indices).asnumpy() + + >>> # create a list of NumPy arrays + >>> video = [buffer[i] for i in range(buffer.shape[0])] + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nielsr/videomae-base") + >>> model = VideoMAEModel.from_pretrained("nielsr/videomae-base") + + >>> # prepare video for the model >>> inputs = feature_extractor(video, return_tensors="pt") + + >>> # forward pass >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state ```""" @@ -745,14 +769,19 @@ def forward( ```python >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForPreTraining >>> import numpy as np + >>> import torch - >>> video = np.random.randn(16, 3, 224, 224).tolist() + >>> num_frames = 16 + >>> video = list(np.random.randn(16, 3, 224, 224)) - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/vit-mae-base") - >>> model = VideoMAEForPreTraining.from_pretrained("nanjing/vit-mae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nielsr/videomae-base") + >>> model = VideoMAEForPreTraining.from_pretrained("nielsr/videomae-base") >>> pixel_values = feature_extractor(video, return_tensors="pt").pixel_values - >>> bool_masked_pos = ... 
+ + >>> num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2 + >>> seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame + >>> bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool() >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) >>> loss = outputs.loss @@ -914,14 +943,14 @@ def forward( >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification >>> import numpy as np - >>> video = np.random.randn(16, 3, 224, 224).tolist() + >>> video = list(np.random.randn(16, 3, 224, 224)) - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nanjing/videomae-base") - >>> model = VideoMAEForVideoClassification.from_pretrained("nanjing/videomae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nielsr/videomae-base") + >>> model = VideoMAEForVideoClassification.from_pretrained("nielsr/videomae-base") >>> inputs = feature_extractor(video, return_tensors="pt") >>> outputs = model(**inputs) - >>> last_hidden_states = outputs.last_hidden_state + >>> logits = outputs.logits ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From ae2a3ee4b7ce58ce8f96f752b22fc2871bb62bff Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 1 Aug 2022 18:33:46 +0200 Subject: [PATCH 35/42] Move constants to dedicated script --- src/transformers/image_utils.py | 13 +++++++++---- .../models/videomae/modeling_videomae.py | 2 +- src/transformers/utils/__init__.py | 1 + src/transformers/utils/constants.py | 4 ++++ 4 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 src/transformers/utils/constants.py diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index ddef7a3a777e..45fdf9585ee5 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -23,13 +23,18 @@ import requests from .utils import is_torch_available +from .utils.constants import ( # noqa: F401 + IMAGENET_DEFAULT_MEAN, + IMAGENET_DEFAULT_STD, + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, +) from .utils.generic import _is_torch -IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] -IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] -IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] -IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] +IMAGENET_DEFAULT_MEAN = IMAGENET_DEFAULT_MEAN +IMAGENET_DEFAULT_STD = IMAGENET_DEFAULT_STD + ImageInput = Union[ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 557bac43b667..9e0f5f619c67 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -28,7 +28,6 @@ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN -from ...image_utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from ...modeling_outputs import BaseModelOutput, ImageClassifierOutput from ...modeling_utils import PreTrainedModel from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer @@ -39,6 +38,7 @@ logging, replace_return_docstrings, ) +from ...utils.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD from .configuration_videomae import VideoMAEConfig diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 2334c351cc51..377932e2d490 100644 --- 
a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -22,6 +22,7 @@ from packaging import version from .. import __version__ +from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD from .doc import ( add_code_sample_docstrings, add_end_docstrings, diff --git a/src/transformers/utils/constants.py b/src/transformers/utils/constants.py new file mode 100644 index 000000000000..cb18cd3482b2 --- /dev/null +++ b/src/transformers/utils/constants.py @@ -0,0 +1,4 @@ +IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] +IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] +IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] +IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] \ No newline at end of file From 97229e27c5d35f7213149a4401cadef72e55802c Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Mon, 1 Aug 2022 19:24:13 +0200 Subject: [PATCH 36/42] Remove scripts --- .../models/videomae/modeling_videomae.py | 6 +- src/transformers/models/videomae/test.py | 22 -------- .../models/videomae/test_model.py | 55 ------------------- src/transformers/utils/constants.py | 2 +- 4 files changed, 2 insertions(+), 83 deletions(-) delete mode 100644 src/transformers/models/videomae/test.py delete mode 100644 src/transformers/models/videomae/test_model.py diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 9e0f5f619c67..f50bd4d5a435 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -722,11 +722,7 @@ def custom_forward(*inputs): if not return_dict: return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None) - return VideoMAEDecoderOutput( - logits=logits, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - ) + return VideoMAEDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions) @add_start_docstrings( diff --git a/src/transformers/models/videomae/test.py b/src/transformers/models/videomae/test.py deleted file mode 100644 index 5247c4f6f009..000000000000 --- a/src/transformers/models/videomae/test.py +++ /dev/null @@ -1,22 +0,0 @@ -import numpy as np -import torch - -from transformers import VideoMAEFeatureExtractor - - -# test feature extractor - -feature_extractor = VideoMAEFeatureExtractor() - -video = [np.random.rand(512, 640, 3), np.random.rand(312, 200, 3)] - -video = [np.random.rand(3, 512, 640), np.random.rand(3, 312, 200)] - -video = [torch.randn(3, 512, 640), torch.rand(3, 312, 200)] - -# video = np.random.rand(16, 360, 640, 3) -# video = [video[i] for i in range(video.shape[0])] - -encoding = feature_extractor(video, return_tensors="pt") - -print(encoding.pixel_values.shape) diff --git a/src/transformers/models/videomae/test_model.py b/src/transformers/models/videomae/test_model.py deleted file mode 100644 index e47a8a760fa8..000000000000 --- a/src/transformers/models/videomae/test_model.py +++ /dev/null @@ -1,55 +0,0 @@ -import numpy as np -import torch - -from transformers import VideoMAEConfig, VideoMAEForPreTraining - - -class TubeMaskingGenerator: - def __init__(self, input_size, mask_ratio): - self.frames, self.height, self.width = input_size - self.num_patches_per_frame = self.height * self.width - self.total_patches = self.frames * self.num_patches_per_frame - self.num_masks_per_frame = int(mask_ratio * self.num_patches_per_frame) - self.total_masks = self.frames * self.num_masks_per_frame - - def 
__repr__(self): - repr_str = "Maks: total patches {}, mask patches {}".format(self.total_patches, self.total_masks) - return repr_str - - def __call__(self): - mask_per_frame = np.hstack( - [ - np.zeros(self.num_patches_per_frame - self.num_masks_per_frame), - np.ones(self.num_masks_per_frame), - ] - ) - np.random.shuffle(mask_per_frame) - mask = np.tile(mask_per_frame, (self.frames, 1)).flatten() - return mask - - -num_frames = 16 -input_size = 224 -patch_size = (16, 16) -window_size = (num_frames // 2, input_size // patch_size[0], input_size // patch_size[1]) - -masked_position_generator = TubeMaskingGenerator(input_size=window_size, mask_ratio=0.9) - - -# test model - -model = VideoMAEForPreTraining(VideoMAEConfig(norm_pix_loss=True)) - -pixel_values = torch.randn(1, 16, 3, 224, 224) - -bool_masked_pos = masked_position_generator() -print("Shape of bool masked pos:", bool_masked_pos.shape) -print("Number of masked frames:", np.sum(bool_masked_pos)) - -bool_masked_pos = torch.from_numpy(bool_masked_pos) -bool_masked_pos = bool_masked_pos.unsqueeze(0) -bool_masked_pos = bool_masked_pos.flatten(1).to(torch.bool) - -outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) - -print(outputs.logits.shape) diff --git a/src/transformers/utils/constants.py b/src/transformers/utils/constants.py index cb18cd3482b2..af2e48ab0a8b 100644 --- a/src/transformers/utils/constants.py +++ b/src/transformers/utils/constants.py @@ -1,4 +1,4 @@ IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406] IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225] IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5] -IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] \ No newline at end of file +IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5] From 8ef0d69dce764acdaa3b247e31878011e02c5d20 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 2 Aug 2022 15:10:46 +0200 Subject: [PATCH 37/42] Transfer checkpoints, fix docs --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/model_doc/videomae.mdx | 11 ++++++---- src/transformers/image_utils.py | 4 ---- .../models/videomae/configuration_videomae.py | 21 +++++++++---------- .../videomae/feature_extraction_videomae.py | 8 +++---- .../models/videomae/modeling_videomae.py | 17 +++++++-------- .../models/videomae/test_modeling_videomae.py | 13 +++++------- 7 files changed, 35 insertions(+), 41 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index f73ad8e035b3..bee6007a545c 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -385,6 +385,8 @@ title: Swin Transformer V2 - local: model_doc/van title: VAN + - local: model_doc/videomae + title: VideoMAE - local: model_doc/vit title: Vision Transformer (ViT) - local: model_doc/vit_mae diff --git a/docs/source/en/model_doc/videomae.mdx b/docs/source/en/model_doc/videomae.mdx index 72ec84202105..c319944dc8ed 100644 --- a/docs/source/en/model_doc/videomae.mdx +++ b/docs/source/en/model_doc/videomae.mdx @@ -23,7 +23,13 @@ The abstract from the paper is the following: Tips: -- One can use [`VideoMAEFeatureExtractor`] to prepare videos for the model. +- One can use [`VideoMAEFeatureExtractor`] to prepare videos for the model. It will resize + normalize all frames of a video for you. +- [`VideoMAEForPreTraining`] includes the decoder on top for self-supervised pre-training. + + + + VideoMAE pre-training. Taken from the original paper. This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). 
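For a quick feel for the pre-training API documented above, here is a minimal sketch that mirrors the doc examples added later in this series and the (since removed) `test_model.py` script — the random video and the random tube mask are placeholders, and the model is randomly initialized rather than loaded from a checkpoint:

```python
import torch

from transformers import VideoMAEConfig, VideoMAEForPreTraining

# randomly initialized model for illustration; swap in from_pretrained(...) for real weights
config = VideoMAEConfig(norm_pix_loss=True)
model = VideoMAEForPreTraining(config)

# dummy clip: batch of 1, 16 frames of 3 x 224 x 224
num_frames = 16
pixel_values = torch.randn(1, num_frames, 3, config.image_size, config.image_size)

# one boolean decision per spatio-temporal patch (a "tube" spanning tubelet_size frames)
num_patches_per_frame = (config.image_size // config.patch_size) ** 2
seq_length = (num_frames // config.tubelet_size) * num_patches_per_frame
bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
print(outputs.loss, outputs.logits.shape)
```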
@@ -33,7 +39,6 @@ The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). [[autodoc]] VideoMAEConfig - ## VideoMAEFeatureExtractor [[autodoc]] VideoMAEFeatureExtractor @@ -44,13 +49,11 @@ The original code can be found [here](https://github.com/MCG-NJU/VideoMAE). [[autodoc]] VideoMAEModel - forward - ## VideoMAEForPreTraining [[autodoc]] transformers.VideoMAEForPreTraining - forward - ## VideoMAEForVideoClassification [[autodoc]] transformers.VideoMAEForVideoClassification diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index 45fdf9585ee5..dd7bb326993d 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -32,10 +32,6 @@ from .utils.generic import _is_torch -IMAGENET_DEFAULT_MEAN = IMAGENET_DEFAULT_MEAN -IMAGENET_DEFAULT_STD = IMAGENET_DEFAULT_STD - - ImageInput = Union[ PIL.Image.Image, np.ndarray, "torch.Tensor", List[PIL.Image.Image], List[np.ndarray], List["torch.Tensor"] # noqa ] diff --git a/src/transformers/models/videomae/configuration_videomae.py b/src/transformers/models/videomae/configuration_videomae.py index 97a30e9003b8..932c4c1d98ca 100644 --- a/src/transformers/models/videomae/configuration_videomae.py +++ b/src/transformers/models/videomae/configuration_videomae.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved. +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" ViT MAE model configuration""" +""" VideoMAE model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -21,21 +21,20 @@ logger = logging.get_logger(__name__) VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "nanjing/videomae-base": "https://huggingface.co/nanjing/videomae-base/resolve/main/config.json", + "MCG-NJU/videomae-base": "https://huggingface.co/MCG-NJU/videomae-base/resolve/main/config.json", } class VideoMAEConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate an ViT - MAE model according to the specified arguments, defining the model architecture. Instantiating a configuration with - the defaults will yield a similar configuration to that of the ViT - [nanjing/videomae-base](https://huggingface.co/nanjing/videomae-base) architecture. + This is the configuration class to store the configuration of a [`VideoMAEModel`]. It is used to instantiate a + VideoMAE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the VideoMAE + [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from [`PretrainedConfig`] for more information. - Args: image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. 
@@ -84,12 +83,12 @@ class VideoMAEConfig(PretrainedConfig): Example: ```python - >>> from transformers import VideoMAEModel, VideoMAEConfig + >>> from transformers import VideoMAEConfig, VideoMAEModel - >>> # Initializing a ViT MAE vit-mae-base style configuration + >>> # Initializing a VideoMAE videomae-base style configuration >>> configuration = VideoMAEConfig() - >>> # Initializing a model from the vit-mae-base style configuration + >>> # Randomly initializing a model from the configuration >>> model = VideoMAEModel(configuration) >>> # Accessing the model configuration diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 63308458df58..4350c56d44de 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -35,7 +35,7 @@ class VideoMAEFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin): r""" - Constructs a VideoMAE feature extractor. + Constructs a VideoMAE feature extractor. This feature extractor can be used to prepare videos for the model. This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users should refer to this superclass for more information regarding those methods. @@ -43,10 +43,8 @@ class VideoMAEFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix Args: do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the shorter edge of the input to a certain `size`. - size (`int` or `Tuple(int)`, *optional*, defaults to 224): - Resize the shorter edge of the input to the given size. If a tuple is provided, it should be (width, - height). If only an integer is provided, then the input will be resized to (size, size). Only has an effect - if `do_resize` is set to `True`. + size (`int`, *optional*, defaults to 224): + Resize the shorter edge of the input to the given size. Only has an effect if `do_resize` is set to `True`. resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`): An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`, `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`. 
Only has an effect diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index f50bd4d5a435..0a27565ce369 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -45,11 +45,10 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "VideoMAEConfig" -_CHECKPOINT_FOR_DOC = "nanjing/videomae-base" +_CHECKPOINT_FOR_DOC = "MCG-NJU/videomae-base" VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST = [ - # TODO rename to organization - "nielsr/videomae-base", + "MCG-NJU/videomae-base", # See all VideoMAE models at https://huggingface.co/models?filter=videomae ] @@ -605,8 +604,8 @@ def forward( >>> # create a list of NumPy arrays >>> video = [buffer[i] for i in range(buffer.shape[0])] - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nielsr/videomae-base") - >>> model = VideoMAEModel.from_pretrained("nielsr/videomae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base") + >>> model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base") >>> # prepare video for the model >>> inputs = feature_extractor(video, return_tensors="pt") @@ -770,8 +769,8 @@ def forward( >>> num_frames = 16 >>> video = list(np.random.randn(16, 3, 224, 224)) - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nielsr/videomae-base") - >>> model = VideoMAEForPreTraining.from_pretrained("nielsr/videomae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base") + >>> model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base") >>> pixel_values = feature_extractor(video, return_tensors="pt").pixel_values @@ -941,8 +940,8 @@ def forward( >>> video = list(np.random.randn(16, 3, 224, 224)) - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("nielsr/videomae-base") - >>> model = VideoMAEForVideoClassification.from_pretrained("nielsr/videomae-base") + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base") + >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base") >>> inputs = feature_extractor(video, return_tensors="pt") >>> outputs = model(**inputs) diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index ddbee5ff6a6b..dd8c82b390ee 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -352,13 +352,11 @@ def prepare_video(): class VideoMAEModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - # TODO update to appropriate organization - return VideoMAEFeatureExtractor() if is_vision_available() else None + return VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base") if is_vision_available() else None @slow def test_inference_for_video_classification(self): - # TODO update to appropriate organization - model = VideoMAEForVideoClassification.from_pretrained("nielsr/videomae-base").to(torch_device) + model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base").to(torch_device) feature_extractor = self.default_feature_extractor video = prepare_video() @@ -378,15 +376,14 @@ def test_inference_for_video_classification(self): @slow def test_inference_for_pretraining(self): - # TODO update to appropriate organization - model = VideoMAEForPreTraining.from_pretrained("nielsr/videomae-base-short").to(torch_device) + 
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short").to(torch_device) feature_extractor = self.default_feature_extractor video = prepare_video() inputs = feature_extractor(video, return_tensors="pt").to(torch_device) # add boolean mask, indicating which patches to mask - local_path = hf_hub_download(repo_id="nielsr/bool-masked-pos", filename="bool_masked_pos.pt") + local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt") inputs["bool_masked_pos"] = torch.load(local_path) # forward pass @@ -404,7 +401,7 @@ def test_inference_for_pretraining(self): self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4)) # verify the loss (`config.norm_pix_loss` = `False`) - model = VideoMAEForPreTraining.from_pretrained("nielsr/videomae-base-short", norm_pix_loss=False).to( + model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base-short", norm_pix_loss=False).to( torch_device ) From 43b90a70dc0ba3d81d2a6928abea15d49fcc263e Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Tue, 2 Aug 2022 19:13:05 +0200 Subject: [PATCH 38/42] Update script --- .../videomae/convert_videomae_to_pytorch.py | 82 +++++++++++++------ 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 6147ce5c9446..94333c184572 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -20,6 +20,7 @@ import numpy as np import torch +import gdown from huggingface_hub import hf_hub_download from transformers import ( VideoMAEConfig, @@ -29,14 +30,18 @@ ) -def get_videomae_config(checkpoint_path, model_name): +def get_videomae_config(model_name): config = VideoMAEConfig() - if "large" in checkpoint_path: + if "large" in model_name: config.hidden_size = 1024 config.intermediate_size = 4096 config.num_hidden_layers = 24 config.num_attention_heads = 16 + config.decoder_num_hidden_layers = 12 + config.decoder_num_attention_heads = 8 + config.decoder_hidden_size = 512 + config.decoder_intermediate_size = 2048 if "finetuned" not in model_name: config.use_mean_pooling = False @@ -145,19 +150,22 @@ def prepare_video(): return list(video) -def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model_name, push_to_hub): - config = get_videomae_config(checkpoint_path, model_name) +def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_name, push_to_hub): + config = get_videomae_config(model_name) if "finetuned" in model_name: model = VideoMAEForVideoClassification(config) else: model = VideoMAEForPreTraining(config) - state_dict = torch.load(checkpoint_path, map_location="cpu") - if "finetuned" in model_name: - state_dict = state_dict["module"] + # download original checkpoint, hosted on Google Drive + output = "pytorch_model.bin" + gdown.cached_download(checkpoint_url, output, quiet=False) + files = torch.load(output, map_location="cpu") + if "model" in files: + state_dict = files["model"] else: - state_dict = state_dict["model"] + state_dict = files["module"] new_state_dict = convert_state_dict(state_dict, config) model.load_state_dict(new_state_dict) @@ -169,7 +177,7 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model inputs = feature_extractor(video, return_tensors="pt") if "finetuned" not in model_name: - local_path = 
hf_hub_download(repo_id="nielsr/bool-masked-pos", filename="bool_masked_pos.pt") + local_path = hf_hub_download(repo_id="hf-internal-testing/bool-masked-pos", filename="bool_masked_pos.pt") inputs["bool_masked_pos"] = torch.load(local_path) outputs = model(**inputs) @@ -189,33 +197,54 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model "videomae-base-ssv2", "videomae-base-finetuned-ssv2", ] - if model_name not in model_names: - raise ValueError("Model name not supported.") - if model_name == "videomae-base-short": + if model_name == "videomae-base": + expected_shape = torch.Size([1, 1408, 1536]) + expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]]) + elif model_name == "videomae-base-short": expected_shape = torch.Size([1, 1408, 1536]) expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]]) # we verified the loss both for normalized and unnormalized targets for this one expected_loss = torch.tensor([0.5142]) if config.norm_pix_loss else torch.tensor([0.6469]) + elif model_name == "videomae-large": + expected_shape = torch.Size([1, 1408, 1536]) + expected_slice = torch.tensor([[0.7149, 0.7997, 0.6966], [0.6768, 0.7869, 0.6948], [0.5139, 0.6221, 0.5605]]) + elif model_name == "videomae-large-finetuned-kinetics": + expected_shape = torch.Size([1, 400]) + expected_slice = torch.tensor([0.0771, 0.0011, -0.3625]) + elif model_name == "videomae-base-short-finetuned-kinetics": + expected_shape = torch.Size([1, 400]) + expected_slice = torch.tensor([0.6588, 0.0990, -0.2493]) elif model_name == "videomae-base-finetuned-kinetics": expected_shape = torch.Size([1, 400]) expected_slice = torch.tensor([0.3669, -0.0688, -0.2421]) - elif model_name == "videomae-base-finetuned-ssv2": + elif model_name == "videomae-base-short-ssv2": + expected_shape = torch.Size([1, 1408, 1536]) + expected_slice = torch.tensor([[0.4712, 0.5296, 0.5786], [0.2278, 0.2729, 0.4026], [0.0352, 0.0730, 0.2506]]) + elif model_name == "videomae-base-short-finetuned-ssv2": expected_shape = torch.Size([1, 174]) expected_slice = torch.tensor([-0.0537, -0.1539, -0.3266]) + elif model_name == "videomae-base-ssv2": + expected_shape = torch.Size([1, 1408, 1536]) + expected_slice = torch.tensor([[0.8131, 0.8727, 0.8546], [0.7366, 0.9377, 0.8870], [0.5935, 0.8874, 0.8564]]) + elif model_name == "videomae-base-finetuned-ssv2": + expected_shape = torch.Size([1, 174]) + expected_slice = torch.tensor([0.1961, -0.8337, -0.6389]) + else: + raise ValueError(f"Model name not supported. 
Should be one of {model_names}") # verify logits assert logits.shape == expected_shape if "finetuned" in model_name: assert torch.allclose(logits[0, :3], expected_slice, atol=1e-4) else: + print("Logits:", logits[0, :3, :3]) assert torch.allclose(logits[0, :3, :3], expected_slice, atol=1e-4) print("Logits ok!") # verify loss, if applicable - if "finetuned" not in model_name: + if model_name == "videomae-base-short": loss = outputs.loss - print("Loss:", loss) assert torch.allclose(loss, expected_loss, atol=1e-4) print("Loss ok!") @@ -233,23 +262,24 @@ def convert_videomae_checkpoint(checkpoint_path, pytorch_dump_folder_path, model parser = argparse.ArgumentParser() # Required parameters parser.add_argument( - "--checkpoint_path", - default=( - "/Users/nielsrogge/Documents/VideoMAE/Original" - " checkpoints/Kinetics-400/videomae-base-finetuned-kinetics/checkpoint.pth" - ), + "--checkpoint_url", + default="https://drive.google.com/u/1/uc?id=1tEhLyskjb755TJ65ptsrafUG2llSwQE1&export=download&confirm=t&uuid=aa3276eb-fb7e-482a-adec-dc7171df14c4", type=str, - help="Path of the original PyTorch checkpoint you'd like to convert.", - ) - parser.add_argument( - "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + help=( + "URL of the original PyTorch checkpoint (on Google Drive) you'd like to convert. Should be a direct" + " download link." + ), ) parser.add_argument( - "--model_name", default="videomae-base-finetuned-kinetics", type=str, help="Name of the model." + "--pytorch_dump_folder_path", + default="/Users/nielsrogge/Documents/VideoMAE/Test", + type=str, + help="Path to the output PyTorch model directory.", ) + parser.add_argument("--model_name", default="videomae-base", type=str, help="Name of the model.") parser.add_argument( "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." 
) args = parser.parse_args() - convert_videomae_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub) + convert_videomae_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path, args.model_name, args.push_to_hub) From cd3aa21a2f1ef59abd4cc5de0c32e0538c34f420 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Aug 2022 11:33:37 +0200 Subject: [PATCH 39/42] Update image mean and std --- .../videomae/convert_videomae_to_pytorch.py | 3 ++- .../videomae/feature_extraction_videomae.py | 18 ++++++------------ .../models/videomae/test_modeling_videomae.py | 11 +++++++++-- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/videomae/convert_videomae_to_pytorch.py b/src/transformers/models/videomae/convert_videomae_to_pytorch.py index 94333c184572..60e5ae8f5f41 100644 --- a/src/transformers/models/videomae/convert_videomae_to_pytorch.py +++ b/src/transformers/models/videomae/convert_videomae_to_pytorch.py @@ -172,7 +172,7 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_ model.eval() # verify model on basic input - feature_extractor = VideoMAEFeatureExtractor() + feature_extractor = VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5]) video = prepare_video() inputs = feature_extractor(video, return_tensors="pt") @@ -198,6 +198,7 @@ def convert_videomae_checkpoint(checkpoint_url, pytorch_dump_folder_path, model_ "videomae-base-finetuned-ssv2", ] + # NOTE: logits were tested with image_mean and image_std equal to [0.5, 0.5, 0.5] and [0.5, 0.5, 0.5] if model_name == "videomae-base": expected_shape = torch.Size([1, 1408, 1536]) expected_slice = torch.tensor([[0.7739, 0.7968, 0.7089], [0.6701, 0.7487, 0.6209], [0.4287, 0.5158, 0.4773]]) diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 4350c56d44de..132dabda8c68 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -20,14 +20,8 @@ from PIL import Image from ...feature_extraction_utils import BatchFeature, FeatureExtractionMixin -from ...image_utils import ( - IMAGENET_STANDARD_MEAN, - IMAGENET_STANDARD_STD, - ImageFeatureExtractionMixin, - ImageInput, - is_torch_tensor, -) -from ...utils import TensorType, logging +from ...image_utils import ImageFeatureExtractionMixin, ImageInput, is_torch_tensor +from ...utils import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, TensorType, logging logger = logging.get_logger(__name__) @@ -53,9 +47,9 @@ class VideoMAEFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix Whether to center crop the input to a certain `size`. do_normalize (`bool`, *optional*, defaults to `True`): Whether or not to normalize the input with mean and standard deviation. - image_mean (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + image_mean (`List[int]`, defaults to `[0.485, 0.456, 0.406]`): The sequence of means for each channel, to be used when normalizing images. - image_std (`List[int]`, defaults to `[0.5, 0.5, 0.5]`): + image_std (`List[int]`, defaults to `[0.229, 0.224, 0.225]`): The sequence of standard deviations for each channel, to be used when normalizing images. 
""" @@ -78,8 +72,8 @@ def __init__( self.resample = resample self.do_center_crop = do_center_crop self.do_normalize = do_normalize - self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN - self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD def resize_video(self, video, size, resample="bilinear"): return [self.resize(frame, size, resample, default_to_square=False) for frame in video] diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index dd8c82b390ee..eed593f6b691 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -352,11 +352,18 @@ def prepare_video(): class VideoMAEModelIntegrationTest(unittest.TestCase): @cached_property def default_feature_extractor(self): - return VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base") if is_vision_available() else None + # logits were tested with a different mean and std, so we use the same here + return ( + VideoMAEFeatureExtractor(image_mean=[0.5, 0.5, 0.5], image_std=[0.5, 0.5, 0.5]) + if is_vision_available() + else None + ) @slow def test_inference_for_video_classification(self): - model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base").to(torch_device) + model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics").to( + torch_device + ) feature_extractor = self.default_feature_extractor video = prepare_video() From c510c3deae6b291ca94743571076a75faa7e901f Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Aug 2022 14:48:41 +0200 Subject: [PATCH 40/42] Fix doc tests --- .../models/videomae/modeling_videomae.py | 60 +++++++++++++++---- .../models/videomae/test_modeling_videomae.py | 8 ++- 2 files changed, 54 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/videomae/modeling_videomae.py b/src/transformers/models/videomae/modeling_videomae.py index 0a27565ce369..a807ed7208fc 100644 --- a/src/transformers/models/videomae/modeling_videomae.py +++ b/src/transformers/models/videomae/modeling_videomae.py @@ -580,25 +580,29 @@ def forward( ```python >>> from decord import VideoReader, cpu >>> import numpy as np + >>> from transformers import VideoMAEFeatureExtractor, VideoMAEModel + >>> from huggingface_hub import hf_hub_download - >>> def sample_frame_indices(clip_len, frame_sample_rate): + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): ... converted_len = int(clip_len * frame_sample_rate) ... end_idx = np.random.randint(converted_len, seg_len) ... start_idx = end_idx - converted_len ... indices = np.linspace(start_idx, end_idx, num=clip_len) - ... indices = np.clip(index, start_idx, end_idx - 1).astype(np.int64) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) ... return indices >>> # video clip consists of 300 frames (10 seconds at 30 FPS) - >>> video_url = "https://huggingface.co/datasets/nielsr/video-demo/resolve/main/eating_spaghetti.mp4" - >>> vr = VideoReader(video_url, num_threads=1, ctx=cpu(0)) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... 
) + >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0)) >>> # sample 16 frames >>> vr.seek(0) - >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4) + >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr)) >>> buffer = vr.get_batch(indices).asnumpy() >>> # create a list of NumPy arrays @@ -613,6 +617,8 @@ def forward( >>> # forward pass >>> outputs = model(**inputs) >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 1568, 768] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -935,17 +941,49 @@ def forward( Examples: ```python + >>> from decord import VideoReader, cpu + >>> import torch + >>> from transformers import VideoMAEFeatureExtractor, VideoMAEForVideoClassification - >>> import numpy as np + >>> from huggingface_hub import hf_hub_download - >>> video = list(np.random.randn(16, 3, 224, 224)) - >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base") - >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base") + >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len): + ... converted_len = int(clip_len * frame_sample_rate) + ... end_idx = np.random.randint(converted_len, seg_len) + ... start_idx = end_idx - converted_len + ... indices = np.linspace(start_idx, end_idx, num=clip_len) + ... indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64) + ... return indices + + + >>> # video clip consists of 300 frames (10 seconds at 30 FPS) + >>> file_path = hf_hub_download( + ... repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset" + ... ) + >>> vr = VideoReader(file_path, num_threads=1, ctx=cpu(0)) + + >>> # sample 16 frames + >>> vr.seek(0) + >>> indices = sample_frame_indices(clip_len=16, frame_sample_rate=4, seg_len=len(vr)) + >>> buffer = vr.get_batch(indices).asnumpy() + + >>> # create a list of NumPy arrays + >>> video = [buffer[i] for i in range(buffer.shape[0])] + + >>> feature_extractor = VideoMAEFeatureExtractor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics") + >>> model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics") >>> inputs = feature_extractor(video, return_tensors="pt") - >>> outputs = model(**inputs) - >>> logits = outputs.logits + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + ... 
logits = outputs.logits + + >>> # model predicts one of the 400 Kinetics-400 classes + >>> predicted_label = logits.argmax(-1).item() + >>> print(model.config.id2label[predicted_label]) + eating spaghetti ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index eed593f6b691..adce62021c9d 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -399,12 +399,14 @@ def test_inference_for_pretraining(self): # verify the logits expected_shape = torch.Size([1, 1408, 1536]) - expected_slice = torch.tensor([[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]]) + expected_slice = torch.tensor( + [[0.7994, 0.9612, 0.8508], [0.7401, 0.8958, 0.8302], [0.5862, 0.7468, 0.7325]], device=torch_device + ) self.assertEqual(outputs.logits.shape, expected_shape) self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)) # verify the loss (`config.norm_pix_loss` = `True`) - expected_loss = torch.tensor([0.5142]) + expected_loss = torch.tensor([0.5142], device=torch_device) self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4)) # verify the loss (`config.norm_pix_loss` = `False`) @@ -415,5 +417,5 @@ def test_inference_for_pretraining(self): with torch.no_grad(): outputs = model(**inputs) - expected_loss = torch.tensor(torch.tensor([0.6469])) + expected_loss = torch.tensor(torch.tensor([0.6469]), device=torch_device) self.assertTrue(torch.allclose(outputs.loss, expected_loss, atol=1e-4)) From 256f2c8bd0ccf7c9d3948afb910121ca77fbdd95 Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Wed, 3 Aug 2022 15:50:19 +0200 Subject: [PATCH 41/42] Set return_tensors to NumPy by default --- src/transformers/models/videomae/feature_extraction_videomae.py | 2 +- tests/models/videomae/test_feature_extraction_videomae.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 132dabda8c68..942580b2b55e 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -98,7 +98,7 @@ def normalize_video(self, video, mean, std): return (video - mean[None, :, None, None]) / std[None, :, None, None] def __call__( - self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs + self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = "np", **kwargs ) -> BatchFeature: """ Main method to prepare for the model one or several video(s). 
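This commit (and the revert that closes the series) only toggles the default value of `return_tensors`; for reference, a small sketch of what the flag controls when calling the feature extractor, mirroring the throwaway `test.py` removed earlier in this series — the dummy frames and printed shapes are illustrative assumptions:

```python
import numpy as np

from transformers import VideoMAEFeatureExtractor

feature_extractor = VideoMAEFeatureExtractor()

# dummy clip: 16 frames in (height, width, channels) format with values in [0, 1)
video = [np.random.rand(360, 640, 3) for _ in range(16)]

# with "np" the pixel_values come back as a NumPy array, with "pt" as a PyTorch tensor
np_batch = feature_extractor(video, return_tensors="np")
pt_batch = feature_extractor(video, return_tensors="pt")

# frames are resized/center-cropped to 224 and stacked: (batch, frames, channels, height, width)
print(type(np_batch.pixel_values), np_batch.pixel_values.shape)
print(type(pt_batch.pixel_values), pt_batch.pixel_values.shape)
```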
diff --git a/tests/models/videomae/test_feature_extraction_videomae.py b/tests/models/videomae/test_feature_extraction_videomae.py index 971b4da729ea..cfe00f51e5e5 100644 --- a/tests/models/videomae/test_feature_extraction_videomae.py +++ b/tests/models/videomae/test_feature_extraction_videomae.py @@ -176,8 +176,6 @@ def test_call_pytorch(self): self.assertIsInstance(video[0], torch.Tensor) # Test not batched input - for video in video_inputs: - print(video[0].shape) encoded_videos = feature_extractor(video_inputs[0], return_tensors="pt").pixel_values self.assertEqual( encoded_videos.shape, From aa48fee1d884ec384656e0b7331adbecd2eabb8d Mon Sep 17 00:00:00 2001 From: Niels Rogge Date: Thu, 4 Aug 2022 13:55:59 +0200 Subject: [PATCH 42/42] Revert the previous change --- src/transformers/models/videomae/feature_extraction_videomae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/videomae/feature_extraction_videomae.py b/src/transformers/models/videomae/feature_extraction_videomae.py index 942580b2b55e..132dabda8c68 100644 --- a/src/transformers/models/videomae/feature_extraction_videomae.py +++ b/src/transformers/models/videomae/feature_extraction_videomae.py @@ -98,7 +98,7 @@ def normalize_video(self, video, mean, std): return (video - mean[None, :, None, None]) / std[None, :, None, None] def __call__( - self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = "np", **kwargs + self, videos: ImageInput, return_tensors: Optional[Union[str, TensorType]] = None, **kwargs ) -> BatchFeature: """ Main method to prepare for the model one or several video(s).