diff --git a/references/video_classification/README.md b/references/video_classification/README.md index 9bd1b9cc285..cbd303275e5 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -81,6 +81,7 @@ Video resnet models: ``` # number of frames per clip --clip_len 16 \ +--frame-rate 15 \ # allow for temporal jittering --clips_per_video 5 \ --batch-size 24 \ @@ -97,6 +98,21 @@ Video resnet models: --val-crop-size 112 112 ``` +### S3D + +The S3D model was trained similarly to the above but with the following changes on the default configuration: +``` +--batch-size=12 --lr 0.2 --clip-len 64 --clips-per-video 5 --sync-bn \ +--train-resize-size 256 256 --train-crop-size 224 224 --val-resize-size 256 256 --val-crop-size 224 224 +``` + +We used 64 GPUs to train the architecture. + +To estimate the validation statistics of the model, we run the reference script with the following configuration: +``` +--batch-size=16 --test-only --clip-len 128 --clips-per-video 1 +``` + ### Additional video modelling resources - [Video Model Zoo](https://github.com/facebookresearch/VMZ) diff --git a/torchvision/models/video/s3d.py b/torchvision/models/video/s3d.py index f80d849683c..f7d364c665f 100644 --- a/torchvision/models/video/s3d.py +++ b/torchvision/models/video/s3d.py @@ -104,7 +104,7 @@ class S3D(nn.Module): def __init__( self, num_classes: int = 400, - dropout: float = 0.0, + dropout: float = 0.2, norm_layer: Optional[Callable[..., torch.nn.Module]] = None, ) -> None: super().__init__() @@ -153,28 +153,26 @@ def forward(self, x): class S3D_Weights(WeightsEnum): KINETICS400_V1 = Weights( - url="https://download.pytorch.org/models/s3d-1bd8ae63.pth", + url="https://download.pytorch.org/models/s3d-d76dad2f.pth", transforms=partial( VideoClassification, crop_size=(224, 224), resize_size=(256, 256), - mean=(0.5, 0.5, 0.5), - std=(0.5, 0.5, 0.5), ), meta={ "min_size": (224, 224), "min_temporal_size": 14, "categories": _KINETICS400_CATEGORIES, - "recipe": "https://github.com/pytorch/vision/pull/6412#issuecomment-1219687434", + "recipe": "https://github.com/pytorch/vision/tree/main/references/video_classification#s3d", "_docs": ( - "The weights are ported from a community repository. The accuracies are estimated on clip-level " + "The weights aim to approximate the accuracy of the paper. The accuracies are estimated on clip-level " "with parameters `frame_rate=15`, `clips_per_video=1`, and `clip_len=128`." ), "num_params": 8320048, "_metrics": { "Kinetics-400": { - "acc@1": 67.315, - "acc@5": 87.593, + "acc@1": 68.368, + "acc@5": 88.050, } }, },