Add base forward grad logic #49097
@@ -0,0 +1,27 @@
#include <ATen/ATen.h>

namespace at {
namespace native {

/// This function can be used to create a dual Tensor that holds a tangent to compute forward mode gradients.
/// Note that the dual Tensor's primal is a view of the given primal and the given tangent is used as-is.
/// This function is backward differentiable.
at::Tensor make_dual(const at::Tensor& primal, const at::Tensor& tangent, int64_t level) {
  TORCH_CHECK(!primal.fw_grad(level).defined(), "Making a dual Tensor based on a Tensor that "
              "already has a forward gradient at the same level ", level, " is not supported.");

  auto dual_tensor = primal.view(primal.sizes());
  dual_tensor.set_fw_grad(tangent, level, /* is_inplace_op */ false);
  return dual_tensor;
}

/// This function can be used to unpack a given dual Tensor to get its primal and tangent. The returned primal
/// is a view of the dual and the tangent is returned as-is.
/// This function is backward differentiable.
std::tuple<at::Tensor, at::Tensor> unpack_dual(const at::Tensor& tensor, int64_t level) {
  return std::tuple<at::Tensor, at::Tensor>(tensor._fw_primal(level), tensor.fw_grad(level));
}

} // namespace native

} // namespace at
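
For orientation, here is a minimal sketch of how these two native functions are exercised through the Python frontend wired up in this PR (torch.autograd.forward_ad, imported as fwAD in the tests below). It mirrors what test_forward_level_cleanup checks and is illustrative only:

import torch
import torch.autograd.forward_ad as fwAD  # Python frontend over the native make_dual/unpack_dual

primal = torch.rand(3)
tangent = torch.rand(3)

# Forward-mode state is scoped to a dual level; nesting is not fully supported
# yet, so in practice the level used by these calls is always 0.
with fwAD.dual_level():
    # The dual's primal is a view of `primal`, and `tangent` is attached to it
    # at the current level.
    dual = fwAD.make_dual(primal, tangent)

    # unpack_dual returns (primal view, tangent). Because `primal` here is a
    # plain tensor with the same layout, the tangent is re-used as-is.
    p, t = fwAD.unpack_dual(dual)
    assert t is tangent

    # Attaching a second tangent to the same tensor at the same level is
    # rejected by the TORCH_CHECK in make_dual above.
    # fwAD.make_dual(dual, tangent)  # would raise a RuntimeError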
@@ -44,6 +44,17 @@ const at::Tensor& TensorImpl::grad() const {
  return autograd_meta_->grad();
}

const at::Tensor& TensorImpl::fw_grad(uint64_t level, const at::Tensor& self) const {
  // See TensorImpl::grad() above for explanation about the line below
  if (!autograd_meta_) return impl::GetAutogradMetaFactory()->undefined_tensor();

Review comment: @swolchok You don't happen to know a good way to avoid this goofiness? (I guess the big problem is we're returning …)

  return autograd_meta_->fw_grad(level, self);
}

void TensorImpl::set_fw_grad(const at::Tensor& new_grad, const at::Tensor& self, uint64_t level, bool is_inplace_op) {
  if (!autograd_meta_) autograd_meta_ = impl::GetAutogradMetaFactory()->make();
  autograd_meta_->set_fw_grad(new_grad, self, level, is_inplace_op);
}

TensorImpl::TensorImpl(
    Storage&& storage,
    DispatchKeySet key_set,
@@ -136,6 +136,8 @@ struct C10_API AutogradMetaInterface {
  virtual bool requires_grad() const = 0;
  virtual at::Tensor& mutable_grad() = 0;
  virtual const at::Tensor& grad() const = 0;
  virtual const at::Tensor& fw_grad(uint64_t level, const at::Tensor& self) const = 0;
  virtual void set_fw_grad(const at::Tensor& new_grad, const at::Tensor& self, uint64_t level, bool is_inplace_op) = 0;
  virtual ~AutogradMetaInterface();
};
@@ -598,6 +600,42 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
   */
  const at::Tensor& grad() const;

  /**
   * Return the accumulated gradient of a tensor. This gradient is computed
   * using forward mode AD.
   *
   * This is an internal API that should never be used by end users.
   *
   * The API is as follows:
   * - "level" specifies the level of forward AD nesting for which the
   *   gradient should be returned. Note that since levels are not fully
   *   supported yet, this argument should be 0. See documentation for
   *   torch::autograd::enter_dual_level for more details about forward AD nesting.
   * - "self" should represent the Tensor whose forward grad is accessed. It is
   *   required when dealing with views.
   */
  const at::Tensor& fw_grad(uint64_t level, const at::Tensor& self) const;

  /**
   * Sets the forward gradient for this Tensor.
   * The given Tensor might not be used directly and its content will be copied.
   *
   * This is an internal API that should never be used by end users.
   *
   * The API is as follows:
   * - "new_grad" is a Tensor containing the new value of the gradient that
   *   should be set.
   * - "self" should represent the Tensor whose forward grad is accessed. It is
   *   required when dealing with views.
   * - "level" specifies the level of forward AD nesting for which the
   *   gradient should be set. Note that since levels are not fully supported
   *   yet, this argument should be 0. See documentation for torch::autograd::enter_dual_level
   *   for more details about forward AD nesting.
   * - "is_inplace_op" is a boolean flag that tells whether this gradient was generated
   *   by an inplace operation or an out-of-place one. This allows better error checking.
   */
  void set_fw_grad(const at::Tensor& new_grad, const at::Tensor& self, uint64_t level, bool is_inplace_op);

Review comments on the copy behavior described above: "We don't necessarily set the tensor in all cases; if self is a view, we may copy instead. This means that the forward gradient does not necessarily alias with the input tensor." / "If it has different strides, it will get restrided to match."

  /**
   * Return a typed data pointer to the actual data which this tensor refers to.
   * This checks that the requested type (from the template parameter) matches
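
To make the aliasing caveat from those review comments concrete, here is a hypothetical probe written against the user-facing fwAD API rather than this internal TensorImpl one. It prints rather than asserts, because the exact copy/restride rules are precisely what the comments above describe and may differ across versions:

import torch
import torch.autograd.forward_ad as fwAD

primal = torch.rand(4, 4)
same_layout = torch.rand(4, 4)       # contiguous, same sizes/strides as primal
other_layout = torch.rand(4, 4).t()  # transposed, same sizes but different strides

with fwAD.dual_level():
    dual = fwAD.make_dual(primal, same_layout)
    stored = fwAD.unpack_dual(dual)[1]
    # With matching metadata the tangent is expected to be stored as-is.
    print("same layout aliases:", stored is same_layout)

with fwAD.dual_level():
    dual = fwAD.make_dual(primal, other_layout)
    stored = fwAD.unpack_dual(dual)[1]
    # With mismatched strides the tangent may be restrided/copied to match the
    # primal, in which case it no longer aliases the tensor passed in.
    print("other layout aliases:", stored is other_layout, "strides:", stored.stride())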
@@ -35,6 +35,7 @@
    IS_WINDOWS, IS_MACOS, CudaMemoryLeakCheck)
from torch.autograd import Variable, Function, detect_anomaly, kineto_available
from torch.autograd.function import InplaceFunction
import torch.autograd.forward_ad as fwAD
from torch.testing import randn_like
from torch.testing._internal.common_methods_invocations import (method_tests,
                                                                create_input, unpack_variables,
@@ -5350,6 +5351,26 @@ def fn(a, dim0_size=5):

        self.assertEqual(x.grad, y.grad)

Review comment: This test is added in this PR because otherwise only tests running with MKLDNN or XLA exercise this codepath.

    def test_view_with_multi_output(self):
        x = torch.randn(2, 2, 2, dtype=torch.double)

        x1 = torch.view_as_complex(x)
        # Taking an invalid view should always be allowed as long as it is not
        # modified inplace
        res = x1.unbind(0)

        with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"):
            res[0] += torch.rand(2, requires_grad=True)

        x.requires_grad_(True)
        x1 = torch.view_as_complex(x)
        # Taking an invalid view should always be allowed as long as it is not
        # modified inplace
        res = x1.unbind(0)

        with self.assertRaisesRegex(RuntimeError, "output of a function that returns multiple views"):
            res[0] += torch.rand(2, requires_grad=True)

    def as_identity(self):
        # view_as_real and view_as_complex behavior should be like an identity
        def func(z):
@@ -6348,6 +6369,66 @@ def foo(a):
        self.assertEqual(hvp, torch.mm(hes, v.unsqueeze(1)).squeeze(1))
        self.assertEqual(vhp, torch.mm(v.unsqueeze(0), hes).squeeze(0))

class TestAutogradForwardMode(TestCase):
    def test_forward_level_cleanup(self):
        import weakref

        def get_tensor_and_weak_ref():
            # Helper function to get a Tensor and a weak ref that tells us
            # if the c++ version of this Tensor is still alive or not.
            #
            # Create the following reference chain to do so:
            # - python Tensor t
            # - c++ Tensor corresponding to t
            # - c++ Node corresponding to t.grad_fn
            # - python dict of metadata from this Node
            # - an object in this dict that we can take a weakref of

            # Create a new Tensor and Node
            t = torch.rand(2, requires_grad=True).clone()
            # Create the metadata dict
            meta_dict = t.grad_fn.metadata
            # Create the object in the dict

            class Foo(object):
                pass
            my_obj = Foo()
            meta_dict[0] = my_obj

            # After exiting this function, the python Tensor t is the only
            # thing keeping ref alive
            ref = weakref.ref(my_obj)
            return t, ref

        # Sanity check that the helper function works as expected
        t, t_ref = get_tensor_and_weak_ref()
        self.assertIsNotNone(t_ref())

        del t
        self.assertIsNone(t_ref())

        # Main test code
        foo = torch.rand(2)

        with fwAD.dual_level():
            tangent, tangent_ref = get_tensor_and_weak_ref()
            self.assertIsNotNone(tangent_ref())

            dual = fwAD.make_dual(foo, tangent)
            self.assertIsNotNone(tangent_ref())

            # Make sure that the tangent we provided has been re-used as-is
            self.assertTrue(fwAD.unpack_dual(dual)[1] is tangent)

            # Make sure that dual is keeping the tangent alive
            del tangent
            self.assertIsNotNone(tangent_ref())

            # Make sure that the dual level does not keep the c++
            # version of the tangent alive
            del dual
            self.assertIsNone(tangent_ref())

# Generic device type autograd tests.
class TestAutogradDeviceType(TestCase):