diff --git a/docs/source/nn_modules.rst b/docs/source/nn_modules.rst
index 064f51488a..5b979581e0 100644
--- a/docs/source/nn_modules.rst
+++ b/docs/source/nn_modules.rst
@@ -1,11 +1,11 @@
 .. role:: hidden
     :class: hidden-section
 
-torchtext.nn.modules.multiheadattention
+torchtext.nn
 =======================================
 
-.. automodule:: torchtext.nn.modules.multiheadattention
-.. currentmodule:: torchtext.nn.modules.multiheadattention
+.. automodule:: torchtext.nn
+.. currentmodule:: torchtext.nn
 
 :hidden:`MultiheadAttentionContainer`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/torchtext/nn/modules/multiheadattention.py b/torchtext/nn/modules/multiheadattention.py
index b581d245c4..e0909d70b3 100644
--- a/torchtext/nn/modules/multiheadattention.py
+++ b/torchtext/nn/modules/multiheadattention.py
@@ -20,6 +20,7 @@ def __init__(self, nhead, in_proj_container, attention_layer, out_proj, batch_fi
 
         Examples::
             >>> import torch
+            >>> from torchtext.nn import MultiheadAttentionContainer, InProjContainer, ScaledDotProduct
             >>> embed_dim, num_heads, bsz = 10, 5, 64
             >>> in_proj_container = InProjContainer(torch.nn.Linear(embed_dim, embed_dim),
                                                     torch.nn.Linear(embed_dim, embed_dim),
@@ -122,6 +123,7 @@ def __init__(self, dropout=0.0, batch_first=False):
                 as `(batch, seq, feature)`. Default: ``False``
 
         Examples::
+            >>> import torch, torchtext
             >>> SDP = torchtext.nn.ScaledDotProduct(dropout=0.1)
             >>> q = torch.randn(21, 256, 3)
             >>> k = v = torch.randn(21, 256, 3)
@@ -245,6 +247,7 @@ def forward(self,
             value (Tensor): The values to be projected.
 
         Examples::
+            >>> import torch
             >>> from torchtext.nn import InProjContainer
             >>> embed_dim, bsz = 10, 64
             >>> in_proj_container = InProjContainer(torch.nn.Linear(embed_dim, embed_dim),
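
For reference, a minimal self-contained sketch of the ScaledDotProduct docstring example that the second hunk makes importable; the SDP(q, k, v) call and its two return values come from the surrounding docstring rather than this diff, so treat them as an assumption.

    # Sketch: run the ScaledDotProduct example end to end with the new imports.
    import torch
    import torchtext

    SDP = torchtext.nn.ScaledDotProduct(dropout=0.1)
    q = torch.randn(21, 256, 3)          # (seq_len, batch * heads, head_dim)
    k = v = torch.randn(21, 256, 3)
    attn_output, attn_weights = SDP(q, k, v)
    print(attn_output.shape)             # expected to match q: torch.Size([21, 256, 3])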