
Commit 24a53a5

Merge branch 'master' into master
2 parents: e9e3e7f + e18d233

15 files changed: +106 / -102 lines changed

README.md

Lines changed: 2 additions & 2 deletions
@@ -28,10 +28,10 @@ In case you prefer to write your tutorial in jupyter, you can use [this script](
 - Then you can build using `make docs`. This will download the data, execute the tutorials and build the documentation to `docs/` directory. This will take about 60-120 min for systems with GPUs. If you do not have a GPU installed on your system, then see next step.
 - You can skip the computationally intensive graph generation by running `make html-noplot` to build basic html documentation to `_build/html`. This way, you can quickly preview your tutorial.

-> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2**, from /tutorials/src/pytorch-sphinx-theme run `python setup.py install`.
+> If you get **ModuleNotFoundError: No module named 'pytorch_sphinx_theme' make: *** [html-noplot] Error 2** from /tutorials/src/pytorch-sphinx-theme or /venv/src/pytorch-sphinx-theme (while using virtualenv), run `python setup.py install`.


 ## About contributing to PyTorch Documentation and Tutorials
 * You can find information about contributing to PyTorch documentation in the
   PyTorch Repo [README.md](https://github.com/pytorch/pytorch/blob/master/README.md) file.
-* Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).
+* Additional information can be found in [PyTorch CONTRIBUTING.md](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md).

beginner_source/basics/autogradqs_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@
 # A function that we apply to tensors to construct computational graph is
 # in fact an object of class ``Function``. This object knows how to
 # compute the function in the *forward* direction, and also how to compute
-# it's derivative during the *backward propagation* step. A reference to
+# its derivative during the *backward propagation* step. A reference to
 # the backward propagation function is stored in ``grad_fn`` property of a
 # tensor. You can find more information of ``Function`` `in the
 # documentation <https://pytorch.org/docs/stable/autograd.html#function>`__.
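
As a quick, self-contained illustration of the ``grad_fn`` behaviour this hunk describes (the tensor and operations below are made up for the example, not taken from the tutorial):

    import torch

    # ops on tensors that require gradients record a backward function in grad_fn
    x = torch.ones(3, requires_grad=True)
    y = x * 2 + 1          # built from ``Function`` objects under the hood
    print(y.grad_fn)       # e.g. <AddBackward0 object at 0x...>
    y.sum().backward()     # runs the recorded backward functions
    print(x.grad)          # tensor([2., 2., 2.])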

beginner_source/basics/buildmodel_tutorial.py

Lines changed: 2 additions & 2 deletions
@@ -67,7 +67,7 @@ def forward(self, x):

 ##############################################
 # We create an instance of ``NeuralNetwork``, and move it to the ``device``, and print
-# it's structure.
+# its structure.

 model = NeuralNetwork().to(device)
 print(model)
@@ -119,7 +119,7 @@ def forward(self, x):
 # nn.Linear
 # ^^^^^^^^^^^^^^^^^^^^^^
 # The `linear layer <https://pytorch.org/docs/stable/generated/torch.nn.Linear.html>`_
-# is a module that applies a linear transformation on the input using it's stored weights and biases.
+# is a module that applies a linear transformation on the input using its stored weights and biases.
 #
 layer1 = nn.Linear(in_features=28*28, out_features=20)
 hidden1 = layer1(flat_image)
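
For reference, a minimal sketch of the ``nn.Linear`` call shown in this hunk; the random input batch is illustrative only:

    import torch
    from torch import nn

    # the layer stores a weight matrix and a bias vector and applies x @ W.T + b
    layer1 = nn.Linear(in_features=28*28, out_features=20)
    flat_image = torch.rand(3, 28*28)    # batch of 3 flattened 28x28 images
    hidden1 = layer1(flat_image)
    print(layer1.weight.shape)           # torch.Size([20, 784])
    print(layer1.bias.shape)             # torch.Size([20])
    print(hidden1.shape)                 # torch.Size([3, 20])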

beginner_source/basics/data_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -225,7 +225,7 @@ def __getitem__(self, idx):
 # --------------------------
 #
 # We have loaded that dataset into the ``Dataloader`` and can iterate through the dataset as needed.
-# Each iteration below returns a batch of ``train_features`` and ``train_labels``(containing ``batch_size=64`` features and labels respectively).
+# Each iteration below returns a batch of ``train_features`` and ``train_labels`` (containing ``batch_size=64`` features and labels respectively).
 # Because we specified ``shuffle=True``, after we iterate over all batches the data is shuffled (for finer-grained control over
 # the data loading order, take a look at `Samplers <https://pytorch.org/docs/stable/data.html#data-loading-order-and-sampler>`_).
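
A minimal sketch of the iteration pattern this hunk documents, using a toy ``TensorDataset`` in place of the tutorial's FashionMNIST data:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    dataset = TensorDataset(torch.rand(256, 1, 28, 28), torch.randint(0, 10, (256,)))
    train_dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    train_features, train_labels = next(iter(train_dataloader))
    print(f"Feature batch shape: {train_features.size()}")  # torch.Size([64, 1, 28, 28])
    print(f"Labels batch shape: {train_labels.size()}")     # torch.Size([64])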

beginner_source/blitz/README.txt

Lines changed: 4 additions & 5 deletions
@@ -13,12 +13,11 @@ Deep Learning with PyTorch: A 60 Minute Blitz
     Neural Networks
     https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html#

-4. autograd_tutorial.py
-    Automatic Differentiation
-    https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html
-
-5. cifar10_tutorial.py
+4. cifar10_tutorial.py
     Training a Classifier
     https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

+5. data_parallel_tutorial.py
+    Optional: Data Parallelism
+    https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html

beginner_source/blitz/cifar10_tutorial.py

Lines changed: 23 additions & 15 deletions
@@ -246,10 +246,13 @@ def forward(self, x):

 correct = 0
 total = 0
+# since we're not training, we don't need to calculate the gradients for our outputs
 with torch.no_grad():
     for data in testloader:
         images, labels = data
+        # calculate outputs by running images through the network
         outputs = net(images)
+        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
         total += labels.size(0)
         correct += (predicted == labels).sum().item()
@@ -265,23 +268,28 @@ def forward(self, x):
 # Hmmm, what are the classes that performed well, and the classes that did
 # not perform well:

-class_correct = list(0. for i in range(10))
-class_total = list(0. for i in range(10))
+# prepare to count predictions for each class
+correct_pred = {classname: 0 for classname in classes}
+total_pred = {classname: 0 for classname in classes}
+
+# again no gradients needed
 with torch.no_grad():
     for data in testloader:
-        images, labels = data
-        outputs = net(images)
-        _, predicted = torch.max(outputs, 1)
-        c = (predicted == labels).squeeze()
-        for i in range(4):
-            label = labels[i]
-            class_correct[label] += c[i].item()
-            class_total[label] += 1
-
-
-for i in range(10):
-    print('Accuracy of %5s : %2d %%' % (
-        classes[i], 100 * class_correct[i] / class_total[i]))
+        images, labels = data
+        outputs = net(images)
+        _, predictions = torch.max(outputs, 1)
+        # collect the correct predictions for each class
+        for label, prediction in zip(labels, predictions):
+            if label == prediction:
+                correct_pred[classes[label]] += 1
+            total_pred[classes[label]] += 1
+
+
+# print accuracy for each class
+for classname, correct_count in correct_pred.items():
+    accuracy = 100 * float(correct_count) / total_pred[classname]
+    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
+                                                         accuracy))

 ########################################################################
 # Okay, so what next?
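
As a standalone illustration of the ``torch.max(outputs, 1)`` call used above, which returns both the largest value in each row and the index (class) at which it occurs (the numbers are made up):

    import torch

    outputs = torch.tensor([[0.1, 2.5, -0.3],
                            [1.7, 0.2,  0.9]])
    values, predictions = torch.max(outputs, 1)
    print(values)        # tensor([2.5000, 1.7000])
    print(predictions)   # tensor([1, 0])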

beginner_source/blitz/neural_networks_tutorial.py

Lines changed: 3 additions & 2 deletions
@@ -176,8 +176,9 @@ def num_flat_features(self, x):
 # -> loss
 #
 # So, when we call ``loss.backward()``, the whole graph is differentiated
-# w.r.t. the loss, and all Tensors in the graph that have ``requires_grad=True``
-# will have their ``.grad`` Tensor accumulated with the gradient.
+# w.r.t. the neural net parameters, and all Tensors in the graph that have
+# ``requires_grad=True`` will have their ``.grad`` Tensor accumulated with the
+# gradient.
 #
 # For illustration, let us follow a few steps backward:
beginner_source/chatbot_tutorial.py

Lines changed: 5 additions & 5 deletions
@@ -471,7 +471,7 @@ def trimRareWords(voc, pairs, MIN_COUNT):
 # with mini-batches.
 #
 # Using mini-batches also means that we must be mindful of the variation
-# of sentence length in our batches. To accomodate sentences of different
+# of sentence length in our batches. To accommodate sentences of different
 # sizes in the same batch, we will make our batched input tensor of shape
 # *(max_length, batch_size)*, where sentences shorter than the
 # *max_length* are zero padded after an *EOS_token*.
@@ -615,7 +615,7 @@ def batch2TrainData(voc, pair_batch):
 # in normal sequential order, and one that is fed the input sequence in
 # reverse order. The outputs of each network are summed at each time step.
 # Using a bidirectional GRU will give us the advantage of encoding both
-# past and future context.
+# past and future contexts.
 #
 # Bidirectional RNN:
 #
@@ -700,7 +700,7 @@ def forward(self, input_seq, input_lengths, hidden=None):
 # states to generate the next word in the sequence. It continues
 # generating words until it outputs an *EOS_token*, representing the end
 # of the sentence. A common problem with a vanilla seq2seq decoder is that
-# if we rely soley on the context vector to encode the entire input
+# if we rely solely on the context vector to encode the entire input
 # sequence’s meaning, it is likely that we will have information loss.
 # This is especially the case when dealing with long input sequences,
 # greatly limiting the capability of our decoder.
@@ -950,7 +950,7 @@ def maskNLLLoss(inp, target, mask):
 # sequence (or batch of sequences). We use the ``GRU`` layer like this in
 # the ``encoder``. The reality is that under the hood, there is an
 # iterative process looping over each time step calculating hidden states.
-# Alternatively, you ran run these modules one time-step at a time. In
+# Alternatively, you can run these modules one time-step at a time. In
 # this case, we manually loop over the sequences during the training
 # process like we must do for the ``decoder`` model. As long as you
 # maintain the correct conceptual model of these modules, implementing
@@ -1115,7 +1115,7 @@ def trainIters(model_name, voc, pairs, encoder, decoder, encoder_optimizer, deco
 # softmax value. This decoding method is optimal on a single time-step
 # level.
 #
-# To facilite the greedy decoding operation, we define a
+# To facilitate the greedy decoding operation, we define a
 # ``GreedySearchDecoder`` class. When run, an object of this class takes
 # an input sequence (``input_seq``) of shape *(input_seq length, 1)*, a
 # scalar input length (``input_length``) tensor, and a ``max_length`` to
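
A small sketch of the zero-padded *(max_length, batch_size)* batching described in the first hunk above; the index sequences are made-up stand-ins for sentences that already end in an EOS token:

    import itertools
    import torch

    PAD_token = 0
    indexed_sentences = [[5, 8, 2], [7, 4, 9, 3, 2], [6, 2]]
    padded = list(itertools.zip_longest(*indexed_sentences, fillvalue=PAD_token))
    batch = torch.LongTensor(padded)
    print(batch.shape)   # torch.Size([5, 3]) -> (max_length, batch_size)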

beginner_source/dcgan_faces_tutorial.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@
 # :math:`D` and :math:`G` play a minimax game in which :math:`D` tries to
 # maximize the probability it correctly classifies reals and fakes
 # (:math:`logD(x)`), and :math:`G` tries to minimize the probability that
-# :math:`D` will predict its outputs are fake (:math:`log(1-D(G(x)))`).
+# :math:`D` will predict its outputs are fake (:math:`log(1-D(G(z)))`).
 # From the paper, the GAN loss function is
 #
 # .. math:: \underset{G}{\text{min}} \underset{D}{\text{max}}V(D,G) = \mathbb{E}_{x\sim p_{data}(x)}\big[logD(x)\big] + \mathbb{E}_{z\sim p_{z}(z)}\big[log(1-D(G(z)))\big]
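
A rough sketch of how this minimax objective is commonly expressed with ``nn.BCELoss``; ``netD`` and ``netG`` here are toy stand-ins, not the tutorial's DCGAN models:

    import torch
    from torch import nn

    criterion = nn.BCELoss()
    netD = nn.Sequential(nn.Linear(8, 1), nn.Sigmoid())   # toy discriminator
    netG = nn.Linear(4, 8)                                 # toy generator

    real = torch.rand(16, 8)
    fake = netG(torch.randn(16, 4))

    # D maximizes log D(x) + log(1 - D(G(z)))
    errD = criterion(netD(real).view(-1), torch.ones(16)) + \
           criterion(netD(fake.detach()).view(-1), torch.zeros(16))
    # G minimizes log(1 - D(G(z))); in practice it maximizes log D(G(z))
    errG = criterion(netD(fake).view(-1), torch.ones(16))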

beginner_source/nn_tutorial.py

Lines changed: 0 additions & 1 deletion
@@ -85,7 +85,6 @@
     torch.tensor, (x_train, y_train, x_valid, y_valid)
 )
 n, c = x_train.shape
-x_train, x_train.shape, y_train.min(), y_train.max()
 print(x_train, y_train)
 print(x_train.shape)
 print(y_train.min(), y_train.max())
