From 803f7bfaacdd3e18788191a5ced483e8b533806b Mon Sep 17 00:00:00 2001 From: Zino Benaissa Date: Wed, 2 Oct 2019 17:26:27 -0700 Subject: [PATCH 001/113] Implement C++ API version of torch.nn.functional.one_hot (#27081) (#27177) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27177 Add support for F::one_hot C++ function. Test Plan: Added 3 new tests to verify API is working Imported from OSS Differential Revision: D17697934 fbshipit-source-id: a8127fb87c00daa119bb92a5702bc4bbba48290d --- test/cpp/api/functional.cpp | 65 ++++++++++++++++--- torch/csrc/api/include/torch/nn/functional.h | 1 + .../include/torch/nn/functional/embedding.h | 12 ++++ 3 files changed, 70 insertions(+), 8 deletions(-) create mode 100644 torch/csrc/api/include/torch/nn/functional/embedding.h diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 5d3f8226183f1..288e7d7ab4c81 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -15,7 +15,7 @@ TEST_F(FunctionalTest, MaxPool1d) { auto y = F::max_pool1d(x, MaxPool1dOptions(3).stride(2)); ASSERT_EQ(y.ndimension(), 3); - ASSERT_TRUE(torch::allclose(y, torch::ones({1, 1 ,2}))); + ASSERT_TRUE(torch::allclose(y, torch::ones({1, 1, 2}))); ASSERT_EQ(y.sizes(), torch::IntArrayRef({1, 1, 2})); } @@ -24,7 +24,7 @@ TEST_F(FunctionalTest, MaxPool2d) { auto y = F::max_pool2d(x, MaxPool2dOptions(3).stride(2)); ASSERT_EQ(y.ndimension(), 3); - ASSERT_TRUE(torch::allclose(y, torch::ones({2, 2 ,2}))); + ASSERT_TRUE(torch::allclose(y, torch::ones({2, 2, 2}))); ASSERT_EQ(y.sizes(), torch::IntArrayRef({2, 2, 2})); } @@ -67,7 +67,8 @@ TEST_F(FunctionalTest, AvgPool3d) { TEST_F(FunctionalTest, CosineSimilarity) { auto input1 = torch::tensor({{1, 2, 3}, {4, 5, 6}}, torch::kFloat); auto input2 = torch::tensor({{1, 8, 3}, {2, 1, 6}}, torch::kFloat); - auto output = F::cosine_similarity(input1, input2, CosineSimilarityOptions().dim(1)); + auto output = + F::cosine_similarity(input1, input2, CosineSimilarityOptions().dim(1)); auto expected = torch::tensor({0.8078, 0.8721}, torch::kFloat); ASSERT_TRUE(output.allclose(expected, 1e-04)); } @@ -75,7 +76,8 @@ TEST_F(FunctionalTest, CosineSimilarity) { TEST_F(FunctionalTest, PairwiseDistance) { auto input1 = torch::tensor({{1, 2, 3}, {4, 5, 6}}, torch::kFloat); auto input2 = torch::tensor({{1, 8, 3}, {2, 1, 6}}, torch::kFloat); - auto output = F::pairwise_distance(input1, input2, PairwiseDistanceOptions(1)); + auto output = + F::pairwise_distance(input1, input2, PairwiseDistanceOptions(1)); auto expected = torch::tensor({6, 6}, torch::kFloat); ASSERT_TRUE(output.allclose(expected)); } @@ -165,15 +167,18 @@ TEST_F(FunctionalTest, MaxUnpool1d) { auto y = F::max_unpool1d(x, indices, MaxUnpool1dOptions(3)); ASSERT_EQ(y.ndimension(), 3); - ASSERT_TRUE(torch::allclose(y, torch::tensor({{{0, 2, 0, 4, 5, 0, 0, 0, 0}}}, torch::kFloat))); + ASSERT_TRUE(torch::allclose( + y, torch::tensor({{{0, 2, 0, 4, 5, 0, 0, 0, 0}}}, torch::kFloat))); ASSERT_EQ(y.sizes(), torch::IntArrayRef({1, 1, 9})); x = torch::tensor({{{2, 4, 5}}}, torch::requires_grad()); indices = torch::tensor({{{1, 3, 4}}}, torch::kLong); - y = F::max_unpool1d(x, indices, MaxUnpool1dOptions(3), c10::IntArrayRef({1, 1, 9})); + y = F::max_unpool1d( + x, indices, MaxUnpool1dOptions(3), c10::IntArrayRef({1, 1, 9})); ASSERT_EQ(y.ndimension(), 3); - ASSERT_TRUE(torch::allclose(y, torch::tensor({{{0, 2, 0, 4, 5, 0, 0, 0, 0}}}, torch::kFloat))); + ASSERT_TRUE(torch::allclose( + y, torch::tensor({{{0, 2, 0, 4, 5, 0, 0, 0, 0}}}, 
torch::kFloat)));
   ASSERT_EQ(y.sizes(), torch::IntArrayRef({1, 1, 9}));
 
   x = torch::tensor({{{2, 4, 5}}}, torch::requires_grad());
@@ -181,7 +186,8 @@ TEST_F(FunctionalTest, MaxUnpool1d) {
   y = F::max_unpool1d(x, indices, MaxUnpool1dOptions(3).stride(2).padding(1));
 
   ASSERT_EQ(y.ndimension(), 3);
-  ASSERT_TRUE(torch::allclose(y, torch::tensor({{{0, 2, 0, 4, 5}}}, torch::kFloat)));
+  ASSERT_TRUE(
+      torch::allclose(y, torch::tensor({{{0, 2, 0, 4, 5}}}, torch::kFloat)));
   ASSERT_EQ(y.sizes(), torch::IntArrayRef({1, 1, 5}));
 }
 
@@ -254,3 +260,46 @@ TEST_F(FunctionalTest, Hardshrink) {
     ASSERT_TRUE(torch::allclose(y, y_exp));
   }
 }
+
+TEST_F(FunctionalTest, OneHot) {
+  { // Test #1
+    auto x = torch::arange(0, 5, torch::kLong);
+    auto y = F::one_hot(x % 3);
+    auto expected = torch::tensor(
+        {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}, {1, 0, 0}, {0, 1, 0}}, torch::kLong);
+
+    ASSERT_EQ(y.ndimension(), 2);
+    ASSERT_TRUE(torch::allclose(y, expected));
+    ASSERT_EQ(y.sizes(), torch::IntArrayRef({5, 3}));
+  }
+
+  { // Test #2
+    auto x = torch::arange(0, 5, torch::kLong);
+    auto y = F::one_hot(x % 3, 5);
+    auto expected = torch::tensor(
+        {{1, 0, 0, 0, 0},
+         {0, 1, 0, 0, 0},
+         {0, 0, 1, 0, 0},
+         {1, 0, 0, 0, 0},
+         {0, 1, 0, 0, 0}},
+        torch::kLong);
+
+    ASSERT_EQ(y.ndimension(), 2);
+    ASSERT_TRUE(torch::allclose(y, expected));
+    ASSERT_EQ(y.sizes(), torch::IntArrayRef({5, 5}));
+  }
+
+  { // Test #3
+    auto x = torch::arange(0, 6, torch::kLong);
+    auto y = F::one_hot(x.view(torch::IntArrayRef({3, 2})) % 3);
+    auto expected = torch::tensor(
+        {{{1, 0, 0}, {0, 1, 0}},
+         {{0, 0, 1}, {1, 0, 0}},
+         {{0, 1, 0}, {0, 0, 1}}},
+        torch::kLong);
+
+    ASSERT_EQ(y.ndimension(), 3);
+    ASSERT_TRUE(torch::allclose(y, expected));
+    ASSERT_EQ(y.sizes(), torch::IntArrayRef({3, 2, 3}));
+  }
+}
diff --git a/torch/csrc/api/include/torch/nn/functional.h b/torch/csrc/api/include/torch/nn/functional.h
index 062db965fb01e..39b3eed642736 100644
--- a/torch/csrc/api/include/torch/nn/functional.h
+++ b/torch/csrc/api/include/torch/nn/functional.h
@@ -3,3 +3,4 @@
 #include
 #include
 #include
+#include <torch/nn/functional/embedding.h>
diff --git a/torch/csrc/api/include/torch/nn/functional/embedding.h b/torch/csrc/api/include/torch/nn/functional/embedding.h
new file mode 100644
index 0000000000000..7003239f1ac4c
--- /dev/null
+++ b/torch/csrc/api/include/torch/nn/functional/embedding.h
@@ -0,0 +1,12 @@
+#pragma once
+
+namespace torch {
+namespace nn {
+namespace functional {
+
+inline Tensor one_hot(const Tensor& tensor, int64_t num_classes = -1) {
+  return torch::one_hot(tensor, num_classes);
+}
+} // namespace functional
+} // namespace nn
+} // namespace torch

From 529c2dfaa9cdd0406ebdb3c34c0c908dceb9c76d Mon Sep 17 00:00:00 2001
From: Mikhail Zolotukhin
Date: Wed, 2 Oct 2019 17:31:43 -0700
Subject: [PATCH 002/113] Refactor torch::jit::script::Module::register_* API.
 (#27189)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27189

Conceptually, Module is just a view over ClassType and ivalue::object. The
register_* methods are the only methods that are an exception to this: they
provide an API not available on ClassType or object directly. This PR ports
this API to ClassType and makes Module truly just a view over those two.
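
Concretely, each register_* method now reduces to the same two-step pattern:
update the shared ClassType, then store the value on the module's
ivalue::object. Taking register_buffer from the module.h diff below (the
comments are added here for exposition; they are not in the patch):

    void register_buffer(const std::string& name, autograd::Variable v) {
      // Add the attribute to the ClassType if it is missing, or verify that
      // an existing slot has a compatible type and parameter-ness.
      type()->addOrCheckAttribute(name, TensorType::get());
      // Store the actual value on this module's ivalue::object.
      module_object()->setAttr(name, v);
    }

register_parameter, register_attribute, and register_module follow the same
pattern, which is what lets Module drop its own slot-management helpers
(check_entity and set_or_add_slot).
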
Test Plan: Imported from OSS Differential Revision: D17703533 Pulled By: ZolotukhinM fbshipit-source-id: 2cdb9fb486b3fb8527986483c7f34be7bd59fabf --- aten/src/ATen/core/jit_type.h | 22 ++++++++++++ torch/csrc/jit/script/module.cpp | 8 ++--- torch/csrc/jit/script/module.h | 61 +++++++++++--------------------- torch/csrc/jit/script/slot.h | 3 ++ 4 files changed, 47 insertions(+), 47 deletions(-) diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index dc40d61a5160f..a26371b559d42 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -1458,6 +1458,28 @@ struct CAFFE2_API ClassType : public NamedType { TypePtr type, bool is_parameter = false); + // Add attribute \p NAME if it doesn't exist or verify that it has a + // compatible type otherwise. + size_t addOrCheckAttribute( + const std::string& name, + TypePtr ty, + bool is_parameter = false) { + auto slot_idx = findAttributeSlot(name); + if (!slot_idx) { + return addAttribute(name, ty, is_parameter); + } + + TORCH_CHECK( + is_parameter == this->is_parameter(*slot_idx), + "Parameter field mismatch for the field '", + name, + "'"); + TypePtr atype = getAttribute(*slot_idx); + TORCH_CHECK(ty->isSubtypeOf(atype)); + return *slot_idx; + } + + at::ArrayRef attributeNames() const { return attributeNames_; } diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index 0ebc35859e8f5..81544f5c89df5 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -334,13 +334,9 @@ Module Module::clone_impl( const Module& orig = s.to_module(); Module cloned = orig.clone_impl(type_remap); type_remap[orig.type()] = cloned.type(); - r.set_or_add_slot( - s.name(), - type_remap.at(s.type()), - cloned.module_object(), - s.entity_type()); + r.register_module(s.name(), cloned); } else { - r.set_or_add_slot(s.name(), s.type(), s.value(), s.entity_type()); + r.register_attribute(s.name(), s.type(), s.value(), s.is_parameter()); } } diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 35a5a80b733ab..17088b76ac514 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -144,28 +144,27 @@ struct TORCH_API Module { // register_buffer method. With this simplification, we only need to track // whether a slot is a parameter to be able to classify it. void register_buffer(const std::string& name, autograd::Variable v) { - set_or_add_slot(name, TensorType::get(), v, EntityType::ATTRIBUTE); + type()->addOrCheckAttribute(name, TensorType::get()); + module_object()->setAttr(name, v); } - void register_parameter( const std::string& name, autograd::Variable v, bool is_buffer) { - set_or_add_slot( - name, - TensorType::get(), - v, - is_buffer ? 
EntityType::ATTRIBUTE : EntityType::PARAMETER);
+    type()->addOrCheckAttribute(name, TensorType::get(), !is_buffer);
+    module_object()->setAttr(name, v);
   }
 
   void register_attribute(
       const std::string& name,
-      const TypePtr type,
-      IValue ivalue) {
-    set_or_add_slot(name, type, ivalue, EntityType::ATTRIBUTE);
+      const TypePtr t,
+      IValue v,
+      bool is_param = false) {
+    type()->addOrCheckAttribute(name, t, is_param);
+    module_object()->setAttr(name, v);
   }
 
   void register_module(const std::string& name, const Module& module) {
-    set_or_add_slot(
-        name, module.type(), module.module_object(), EntityType::MODULE);
+    type()->addOrCheckAttribute(name, module.type());
+    module_object()->setAttr(name, module.module_object());
   }
 
   void set_parameter(const std::string& name, at::Tensor v) {
@@ -396,40 +395,20 @@ struct TORCH_API Module {
     }
     return nullptr;
   }
-  void check_entity(EntityType expected, size_t slot) const {
-    EntityType actual = get_slot(slot).entity_type();
+
+  Slot get_slot(const std::string& name, EntityType etype) const {
+    size_t slot_idx = type()->getAttributeSlot(name);
+    Slot slot = get_slot(slot_idx);
     TORCH_CHECK(
-        expected == actual,
+        etype == slot.entity_type(),
         "The field '",
-        type()->getAttributeName(slot),
+        type()->getAttributeName(slot_idx),
         "' is a ",
-        toString(actual),
+        toString(slot.entity_type()),
         " but this call is"
         " trying to use it as a ",
-        toString(expected));
-  }
-
-  void set_or_add_slot(
-      const std::string& name,
-      const TypePtr& slot_type,
-      IValue v,
-      EntityType etype) {
-    auto slot = type()->findAttributeSlot(name);
-    if (!slot) {
-      slot =
-          type()->addAttribute(name, slot_type, etype == EntityType::PARAMETER);
-    } else {
-      check_entity(etype, *slot);
-    }
-    TypePtr atype = type()->getAttribute(*slot);
-    TORCH_CHECK(slot_type->isSubtypeOf(atype));
-    module_object()->setSlot(*slot, std::move(v));
-  }
-
-  Slot get_slot(const std::string& name, EntityType etype) const {
-    size_t slot = type()->getAttributeSlot(name);
-    check_entity(etype, slot);
-    return get_slot(slot);
+        toString(etype));
+    return slot;
   }
 
   c10::optional<Slot> find_slot(const std::string& name, EntityType etype) const {
diff --git a/torch/csrc/jit/script/slot.h b/torch/csrc/jit/script/slot.h
index def1710492e9c..df3c71fb676e4 100644
--- a/torch/csrc/jit/script/slot.h
+++ b/torch/csrc/jit/script/slot.h
@@ -47,6 +47,9 @@ struct TORCH_API Slot {
   bool is_module() const {
     return entity_type() == EntityType::MODULE;
   }
+  bool is_parameter() const {
+    return container_->type()->is_parameter(offset_);
+  }
   Module to_module() const;
 
   bool operator==(const Slot& rhs) const {

From 4fb442985a6c53bbba9b1d27601a2ab69299d913 Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Wed, 2 Oct 2019 18:03:05 -0700
Subject: [PATCH 003/113] Add c10_experimental ops to BC check white list
 (#27235)

Summary: Experimental ops don't provide a BC guarantee.
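
To make the policy concrete, here is a rough sketch of how an allow-list
entry like the one added below is typically consumed; the helper name and
logic are hypothetical, not the actual code in
check_backward_compatibility.py. A schema is exempt from the BC check while
its name matches an entry whose expiry date has not yet passed:

    import datetime

    white_list = [
        ('c10_experimental', datetime.date(2020, 1, 1)),
    ]

    def skip_bc_check(schema_name, today):
        # Exempt while a matching allow-list entry has not expired.
        return any(schema_name.startswith(prefix) and today < expiry
                   for prefix, expiry in white_list)

    assert skip_bc_check('c10_experimental::SomeOp', datetime.date(2019, 10, 3))
    assert not skip_bc_check('c10_experimental::SomeOp', datetime.date(2020, 2, 1))
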
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27235 Reviewed By: hl475 Differential Revision: D17723292 Pulled By: houseroad fbshipit-source-id: 644ae34d130418a810e0f9d802fa25f6e34c5ccf --- test/backward_compatibility/check_backward_compatibility.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index f18438244334d..e1115f012e979 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -12,6 +12,7 @@ ('quantize', datetime.date(2019, 10, 1)), ('q_per_channel_axis', datetime.date(2019, 10, 1)), ('fbgemm_is_cpu_supported', datetime.date(2019, 10, 1)), + ('c10_experimental', datetime.date(2020, 1, 1)), ] From 27dc5952159031348ccca8b89aa2b465fc311e72 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Wed, 2 Oct 2019 18:49:47 -0700 Subject: [PATCH 004/113] Rename _intrinsic to intrinsic Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27194 Test Plan: Imported from OSS Differential Revision: D17704957 Pulled By: zafartahirov fbshipit-source-id: 46f02d129aa77c3047b2a6c606bfadd831a6b0fc --- test/test_qat.py | 6 +++--- test/test_quantization.py | 8 ++++---- test/test_quantized_models.py | 4 ++-- test/test_quantized_nn_mods.py | 6 +++--- torch/__init__.py | 2 +- torch/nn/{_intrinsic => intrinsic}/__init__.py | 0 torch/nn/{_intrinsic => intrinsic}/modules/__init__.py | 0 torch/nn/{_intrinsic => intrinsic}/modules/fused.py | 0 torch/nn/{_intrinsic => intrinsic}/qat/__init__.py | 0 .../nn/{_intrinsic => intrinsic}/qat/modules/__init__.py | 0 .../{_intrinsic => intrinsic}/qat/modules/conv_fused.py | 8 ++++---- .../{_intrinsic => intrinsic}/qat/modules/linear_relu.py | 6 +++--- torch/nn/{_intrinsic => intrinsic}/quantized/__init__.py | 0 .../quantized/modules/__init__.py | 0 .../quantized/modules/conv_relu.py | 8 ++++---- .../quantized/modules/linear_relu.py | 6 +++--- torch/nn/qat/modules/conv.py | 2 +- torch/nn/qat/modules/linear.py | 2 +- torch/nn/quantized/modules/conv.py | 4 ++-- torch/nn/quantized/modules/linear.py | 4 ++-- torch/quantization/fuse_modules.py | 8 ++++---- torch/quantization/quantize.py | 6 +++--- 22 files changed, 40 insertions(+), 40 deletions(-) rename torch/nn/{_intrinsic => intrinsic}/__init__.py (100%) rename torch/nn/{_intrinsic => intrinsic}/modules/__init__.py (100%) rename torch/nn/{_intrinsic => intrinsic}/modules/fused.py (100%) rename torch/nn/{_intrinsic => intrinsic}/qat/__init__.py (100%) rename torch/nn/{_intrinsic => intrinsic}/qat/modules/__init__.py (100%) rename torch/nn/{_intrinsic => intrinsic}/qat/modules/conv_fused.py (98%) rename torch/nn/{_intrinsic => intrinsic}/qat/modules/linear_relu.py (88%) rename torch/nn/{_intrinsic => intrinsic}/quantized/__init__.py (100%) rename torch/nn/{_intrinsic => intrinsic}/quantized/modules/__init__.py (100%) rename torch/nn/{_intrinsic => intrinsic}/quantized/modules/conv_relu.py (91%) rename torch/nn/{_intrinsic => intrinsic}/quantized/modules/linear_relu.py (88%) diff --git a/test/test_qat.py b/test/test_qat.py index 1ea91558ff343..14b7639a8404c 100644 --- a/test/test_qat.py +++ b/test/test_qat.py @@ -5,7 +5,7 @@ import torch from torch.nn import Conv2d, BatchNorm2d, ReLU -from torch.nn._intrinsic.qat import ConvBn2d, ConvBnReLU2d +from torch.nn.intrinsic.qat import ConvBn2d, ConvBnReLU2d from torch.quantization.QConfig import default_qat_qconfig import torch.backends.mkldnn 
from common_utils import TestCase, run_tests @@ -99,9 +99,9 @@ def test_conv_bn_relu( ).to(dtype=torch.double) qat_op.apply(torch.quantization.disable_fake_quant) if freeze_bn: - qat_op.apply(torch.nn._intrinsic.qat.freeze_bn_stats) + qat_op.apply(torch.nn.intrinsic.qat.freeze_bn_stats) else: - qat_op.apply(torch.nn._intrinsic.qat.update_bn_stats) + qat_op.apply(torch.nn.intrinsic.qat.update_bn_stats) # align inputs and internal parameters input = torch.randn(batch_size, input_channels, height, width, dtype=torch.double, requires_grad=True) diff --git a/test/test_quantization.py b/test/test_quantization.py index a72e1c8173915..081342eb4f315 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -3,9 +3,9 @@ import torch import torch.nn as nn import torch.nn.quantized as nnq -import torch.nn._intrinsic as nni -import torch.nn._intrinsic.quantized as nniq -import torch.nn._intrinsic.qat as nniqat +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.qat as nniqat from torch.quantization import \ QConfig, QConfigDynamic, default_observer, default_weight_observer, get_observer_dict,\ quantize, prepare, convert, prepare_qat, quantize_qat, fuse_modules, \ @@ -294,7 +294,7 @@ def test_resnet_base(self, qconfig): model = convert(model) def checkQuantized(model): - self.assertEqual(type(model.module.conv1), nn._intrinsic.quantized.ConvReLU2d) + self.assertEqual(type(model.module.conv1), nn.intrinsic.quantized.ConvReLU2d) self.assertEqual(type(model.module.myop), nn.quantized.QFunctional) self.assertEqual(type(model.module.avgpool), nn.AdaptiveAvgPool2d) test_only_eval_fn(model, self.img_data) diff --git a/test/test_quantized_models.py b/test/test_quantized_models.py index cf0f41655d550..2fcf2c17f9233 100644 --- a/test/test_quantized_models.py +++ b/test/test_quantized_models.py @@ -75,7 +75,7 @@ def test_fake_quant_true_quant_compare(self): torch.quantization.prepare_qat(fq_model) fq_model.eval() fq_model.apply(torch.quantization.disable_fake_quant) - fq_model.apply(torch.nn._intrinsic.qat.freeze_bn_stats) + fq_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats) fq_model(calib_data) fq_model.apply(torch.quantization.enable_fake_quant) fq_model.apply(torch.quantization.disable_observer) @@ -116,7 +116,7 @@ def test_weight_only_activation_only_fakequant(self): torch.quantization.prepare_qat(fq_model) fq_model.eval() fq_model.apply(torch.quantization.disable_fake_quant) - fq_model.apply(torch.nn._intrinsic.qat.freeze_bn_stats) + fq_model.apply(torch.nn.intrinsic.qat.freeze_bn_stats) fq_model(calib_data) fq_model.apply(torch.quantization.enable_fake_quant) fq_model.apply(torch.quantization.disable_observer) diff --git a/test/test_quantized_nn_mods.py b/test/test_quantized_nn_mods.py index e647873afc837..afa3f423a6421 100644 --- a/test/test_quantized_nn_mods.py +++ b/test/test_quantized_nn_mods.py @@ -1,10 +1,10 @@ import torch import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd -import torch.nn._intrinsic.quantized as nnq_fused +import torch.nn.intrinsic.quantized as nnq_fused import torch.nn.quantized.functional as qF from torch.nn.quantized.modules import Conv2d -from torch.nn._intrinsic.quantized import ConvReLU2d +from torch.nn.intrinsic.quantized import ConvReLU2d import torch.quantization from common_utils import run_tests, IS_PPC, TEST_WITH_UBSAN from common_quantization import QuantizationTestCase, prepare_dynamic @@ -228,7 +228,7 @@ def test_relu(self): qengine=st.sampled_from(("qnnpack", "fbgemm")) ) 
def test_linear_api(self, batch_size, in_features, out_features, use_bias, use_fused, per_channel, qengine): - """test API functionality for nn.quantized.linear and nn._intrinsic.quantized.linear_relu""" + """test API functionality for nn.quantized.linear and nn.intrinsic.quantized.linear_relu""" if qengine not in torch.backends.quantized.supported_engines: return if qengine == 'qnnpack': diff --git a/torch/__init__.py b/torch/__init__.py index 88130b4cc3573..a36171d6d2e04 100644 --- a/torch/__init__.py +++ b/torch/__init__.py @@ -303,7 +303,7 @@ def manager_path(): import torch.autograd from torch.autograd import no_grad, enable_grad, set_grad_enabled # noqa: F401 import torch.nn -import torch.nn._intrinsic +import torch.nn.intrinsic import torch.nn.quantized import torch.optim import torch.multiprocessing diff --git a/torch/nn/_intrinsic/__init__.py b/torch/nn/intrinsic/__init__.py similarity index 100% rename from torch/nn/_intrinsic/__init__.py rename to torch/nn/intrinsic/__init__.py diff --git a/torch/nn/_intrinsic/modules/__init__.py b/torch/nn/intrinsic/modules/__init__.py similarity index 100% rename from torch/nn/_intrinsic/modules/__init__.py rename to torch/nn/intrinsic/modules/__init__.py diff --git a/torch/nn/_intrinsic/modules/fused.py b/torch/nn/intrinsic/modules/fused.py similarity index 100% rename from torch/nn/_intrinsic/modules/fused.py rename to torch/nn/intrinsic/modules/fused.py diff --git a/torch/nn/_intrinsic/qat/__init__.py b/torch/nn/intrinsic/qat/__init__.py similarity index 100% rename from torch/nn/_intrinsic/qat/__init__.py rename to torch/nn/intrinsic/qat/__init__.py diff --git a/torch/nn/_intrinsic/qat/modules/__init__.py b/torch/nn/intrinsic/qat/modules/__init__.py similarity index 100% rename from torch/nn/_intrinsic/qat/modules/__init__.py rename to torch/nn/intrinsic/qat/modules/__init__.py diff --git a/torch/nn/_intrinsic/qat/modules/conv_fused.py b/torch/nn/intrinsic/qat/modules/conv_fused.py similarity index 98% rename from torch/nn/_intrinsic/qat/modules/conv_fused.py rename to torch/nn/intrinsic/qat/modules/conv_fused.py index dab0bf75b6f28..6356f066fbb4f 100644 --- a/torch/nn/_intrinsic/qat/modules/conv_fused.py +++ b/torch/nn/intrinsic/qat/modules/conv_fused.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch import torch.nn as nn -import torch.nn._intrinsic +import torch.nn.intrinsic import torch.nn.qat as nnqat import torch.nn.functional as F from torch.nn import init @@ -28,7 +28,7 @@ class ConvBn2d(nn.Conv2d): weight_fake_quant: fake quant module for weight """ - _FLOAT_MODULE = torch.nn._intrinsic.ConvBn2d + _FLOAT_MODULE = torch.nn.intrinsic.ConvBn2d def __init__(self, # Conv2d args @@ -192,7 +192,7 @@ class ConvBnReLU2d(ConvBn2d): weight_fake_quant: fake quant module for weight """ - _FLOAT_MODULE = torch.nn._intrinsic.ConvBnReLU2d + _FLOAT_MODULE = torch.nn.intrinsic.ConvBnReLU2d def __init__(self, # Conv2d args @@ -236,7 +236,7 @@ class ConvReLU2d(nnqat.Conv2d): weight_fake_quant: fake quant module for weight """ - _FLOAT_MODULE = torch.nn._intrinsic.ConvReLU2d + _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU2d def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, diff --git a/torch/nn/_intrinsic/qat/modules/linear_relu.py b/torch/nn/intrinsic/qat/modules/linear_relu.py similarity index 88% rename from torch/nn/_intrinsic/qat/modules/linear_relu.py rename to torch/nn/intrinsic/qat/modules/linear_relu.py index 
04037555483af..a9c4db6db1b99 100644 --- a/torch/nn/_intrinsic/qat/modules/linear_relu.py +++ b/torch/nn/intrinsic/qat/modules/linear_relu.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch.nn.qat as nnqat -import torch.nn._intrinsic +import torch.nn.intrinsic import torch.nn.functional as F class LinearReLU(nnqat.Linear): @@ -11,7 +11,7 @@ class LinearReLU(nnqat.Linear): We adopt the same interface as :class:`torch.nn.Linear`. - Similar to `torch.nn._intrinsic.LinearReLU`, with FakeQuantize modules initialized to + Similar to `torch.nn.intrinsic.LinearReLU`, with FakeQuantize modules initialized to default. Attributes: @@ -27,7 +27,7 @@ class LinearReLU(nnqat.Linear): >>> print(output.size()) torch.Size([128, 30]) """ - _FLOAT_MODULE = torch.nn._intrinsic.LinearReLU + _FLOAT_MODULE = torch.nn.intrinsic.LinearReLU def __init__(self, in_features, out_features, bias=True, qconfig=None): diff --git a/torch/nn/_intrinsic/quantized/__init__.py b/torch/nn/intrinsic/quantized/__init__.py similarity index 100% rename from torch/nn/_intrinsic/quantized/__init__.py rename to torch/nn/intrinsic/quantized/__init__.py diff --git a/torch/nn/_intrinsic/quantized/modules/__init__.py b/torch/nn/intrinsic/quantized/modules/__init__.py similarity index 100% rename from torch/nn/_intrinsic/quantized/modules/__init__.py rename to torch/nn/intrinsic/quantized/modules/__init__.py diff --git a/torch/nn/_intrinsic/quantized/modules/conv_relu.py b/torch/nn/intrinsic/quantized/modules/conv_relu.py similarity index 91% rename from torch/nn/_intrinsic/quantized/modules/conv_relu.py rename to torch/nn/intrinsic/quantized/modules/conv_relu.py index 53fca2781781b..1a5d70d8a6dbb 100644 --- a/torch/nn/_intrinsic/quantized/modules/conv_relu.py +++ b/torch/nn/intrinsic/quantized/modules/conv_relu.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch.nn.quantized as nnq -import torch.nn._intrinsic -import torch.nn._intrinsic.qat +import torch.nn.intrinsic +import torch.nn.intrinsic.qat from torch.nn.utils import fuse_conv_bn_weights import torch @@ -15,7 +15,7 @@ class ConvReLU2d(nnq.Conv2d): Same as torch.nn.quantized.Conv2d """ - _FLOAT_MODULE = torch.nn._intrinsic.ConvReLU2d + _FLOAT_MODULE = torch.nn.intrinsic.ConvReLU2d def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, @@ -37,7 +37,7 @@ def forward(self, input): @classmethod def from_float(cls, mod): - if type(mod) == torch.nn._intrinsic.qat.ConvBnReLU2d: + if type(mod) == torch.nn.intrinsic.qat.ConvBnReLU2d: mod.weight, mod.bias = \ fuse_conv_bn_weights(mod.weight, mod.bias, mod.running_mean, mod.running_var, mod.eps, mod.gamma, mod.beta) diff --git a/torch/nn/_intrinsic/quantized/modules/linear_relu.py b/torch/nn/intrinsic/quantized/modules/linear_relu.py similarity index 88% rename from torch/nn/_intrinsic/quantized/modules/linear_relu.py rename to torch/nn/intrinsic/quantized/modules/linear_relu.py index e18686dba6f55..5faf8347c3fca 100644 --- a/torch/nn/_intrinsic/quantized/modules/linear_relu.py +++ b/torch/nn/intrinsic/quantized/modules/linear_relu.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch.nn.quantized as nnq -import torch.nn._intrinsic +import torch.nn.intrinsic import torch class LinearReLU(nnq.Linear): @@ -14,13 +14,13 @@ class LinearReLU(nnq.Linear): Examples:: - >>> m = nn._intrinsic.LinearReLU(20, 30) + >>> m = 
nn.intrinsic.LinearReLU(20, 30) >>> input = torch.randn(128, 20) >>> output = m(input) >>> print(output.size()) torch.Size([128, 30]) """ - _FLOAT_MODULE = torch.nn._intrinsic.LinearReLU + _FLOAT_MODULE = torch.nn.intrinsic.LinearReLU def __init__(self, in_features, out_features, bias=True): super(LinearReLU, self).__init__(in_features, out_features, bias) diff --git a/torch/nn/qat/modules/conv.py b/torch/nn/qat/modules/conv.py index d6f489d297375..0338738654b7c 100644 --- a/torch/nn/qat/modules/conv.py +++ b/torch/nn/qat/modules/conv.py @@ -1,6 +1,6 @@ from __future__ import absolute_import, division, print_function, unicode_literals from torch.nn import Conv2d as NNConv2d -from torch.nn._intrinsic import ConvReLU2d +from torch.nn.intrinsic import ConvReLU2d class Conv2d(NNConv2d): r""" diff --git a/torch/nn/qat/modules/linear.py b/torch/nn/qat/modules/linear.py index 192f5cab0b33e..6ab87d6706781 100644 --- a/torch/nn/qat/modules/linear.py +++ b/torch/nn/qat/modules/linear.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch.nn as nn import torch.nn.functional as F -from torch.nn._intrinsic import LinearReLU +from torch.nn.intrinsic import LinearReLU class Linear(nn.Linear): r""" diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index b31c314e774fc..fa0f166c005e4 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn -import torch.nn._intrinsic as nni -import torch.nn._intrinsic.qat as nniqat +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.qat as nniqat from torch.nn.utils import fuse_conv_bn_weights from torch._ops import ops from torch.nn.modules.utils import _pair diff --git a/torch/nn/quantized/modules/linear.py b/torch/nn/quantized/modules/linear.py index 3ff3d6f5e79cd..9220fcb5a415c 100644 --- a/torch/nn/quantized/modules/linear.py +++ b/torch/nn/quantized/modules/linear.py @@ -2,9 +2,9 @@ import torch -from torch._jit_internal import Optional +from torch._jit_internal import Optional # noqa: F401 import torch.nn as nn -import torch.nn._intrinsic as nni +import torch.nn.intrinsic as nni from torch.nn.modules import Module from torch.nn.quantized.modules.utils import _quantize_weight diff --git a/torch/quantization/fuse_modules.py b/torch/quantization/fuse_modules.py index 6784994a2cb77..612af215ff07d 100644 --- a/torch/quantization/fuse_modules.py +++ b/torch/quantization/fuse_modules.py @@ -3,7 +3,7 @@ import torch import copy -import torch.nn._intrinsic.modules.fused as torch_fused +import torch.nn.intrinsic.modules.fused as torch_fused def fuse_conv_bn(conv, bn): r"""Given the conv and bn modules, fuses them and returns the fused module @@ -26,7 +26,7 @@ def fuse_conv_bn(conv, bn): assert bn.num_features == conv.out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' assert bn.affine, 'Only support fusing BatchNorm2d with affine set to True' assert bn.track_running_stats, 'Only support fusing BatchNorm2d with tracking_running_stats set to True' - return torch.nn._intrinsic.ConvBn2d(conv, bn) + return torch.nn.intrinsic.ConvBn2d(conv, bn) else: return torch.nn.utils.fuse_conv_bn_eval(conv, bn) @@ -87,8 +87,8 @@ def fuse_known_modules(mod_list): OP_LIST_TO_FUSER_METHOD = { (torch.nn.Conv2d, torch.nn.BatchNorm2d): fuse_conv_bn, (torch.nn.Conv2d, torch.nn.BatchNorm2d, torch.nn.ReLU): fuse_conv_bn_relu, - (torch.nn.Conv2d, torch.nn.ReLU): torch.nn._intrinsic.ConvReLU2d, 
- (torch.nn.Linear, torch.nn.ReLU): torch.nn._intrinsic.LinearReLU + (torch.nn.Conv2d, torch.nn.ReLU): torch.nn.intrinsic.ConvReLU2d, + (torch.nn.Linear, torch.nn.ReLU): torch.nn.intrinsic.LinearReLU } types = tuple(type(m) for m in mod_list) diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index f1b1828820a7c..efb0949833349 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -3,9 +3,9 @@ import copy import torch import torch.nn as nn -import torch.nn._intrinsic as nni -import torch.nn._intrinsic.quantized as nniq -import torch.nn._intrinsic.qat as nniqat +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.qat as nniqat import torch.nn.quantized as nnq import torch.nn.quantized.dynamic as nnqd from .QConfig import default_dynamic_qconfig, float16_dynamic_qconfig From 1affa7c32c0a85f46bfad1f0cc65bede8df77f27 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 2 Oct 2019 19:53:40 -0700 Subject: [PATCH 005/113] Allow set for qconfig for dynamic_quantize Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27181 Test Plan: Imported from OSS Differential Revision: D17717482 Pulled By: jamesr66a fbshipit-source-id: f3930fc87831cbdcf4390cd769c594bb13f5cd81 --- test/test_quantization.py | 20 +++++++++++++++++++ torch/quantization/quantize.py | 36 ++++++++++++++++++++++------------ 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/test/test_quantization.py b/test/test_quantization.py index 081342eb4f315..c141ad605eec1 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -336,6 +336,11 @@ def checkQuantized(model): quantize_dynamic(model, qconfig_dict, inplace=True) checkQuantized(model) + # Test set qconfig + model = SingleLayerLinearDynamicModel() + quantize_dynamic(model, set([nn.Linear]), inplace=True) + checkQuantized(model) + def test_two_layers(self): r"""TwoLayerLinearModel has two Linear modules but we only quantize the second one `fc2`, and `fc1`is not quantized @@ -359,6 +364,10 @@ def checkQuantized(model): model = quantize_dynamic(TwoLayerLinearModel().eval(), qconfig_dict) checkQuantized(model) + # Test set API + model = quantize_dynamic(TwoLayerLinearModel().eval(), {'fc2'}) + checkQuantized(model) + def test_nested1(self): r"""Test quantization for nested model, top level 'fc3' and 'fc1' of submodule 'sub2', 'sub2.fc2' is not quantized @@ -385,6 +394,9 @@ def checkQuantized(model): model = quantize_dynamic(NestedModel().eval(), qconfig_dict) checkQuantized(model) + model = quantize_dynamic(NestedModel().eval(), {'fc3', 'sub2.fc1'}) + checkQuantized(model) + def test_nested2(self): r"""Another test case for quantized, we will quantize all submodules of submodule sub2 @@ -412,6 +424,10 @@ def checkQuantized(model): model = quantize_dynamic(NestedModel().eval(), qconfig_dict) checkQuantized(model) + # Test set API + model = quantize_dynamic(NestedModel().eval(), {'fc3', 'sub2'}) + checkQuantized(model) + def test_nested3(self): r"""More complicated nested test case with child qconfig overrides parent qconfig @@ -443,6 +459,10 @@ def checkQuantized(model): model = quantize_dynamic(NestedModel().eval(), qconfig_dynamic_dict) checkQuantized(model) + # Test set API + model = quantize_dynamic(NestedModel().eval(), {'fc3', 'sub2', 'sub2.fc1'}) + checkQuantized(model) + def test_type_match_rule(self): r"""Test quantization for nested model, top level 'fc3' and 'fc1' of submodule 'sub2', All 'torch.nn.Linear' modules are 
quantized diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index efb0949833349..f38187f6e1975 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import copy +import itertools import torch import torch.nn as nn import torch.nn.intrinsic as nni @@ -270,7 +271,7 @@ def quantize(model, run_fn, run_args, mapping=DEFAULT_MODULE_MAPPING, inplace=Fa convert(model, mapping, inplace=True) return model -def quantize_dynamic(model, qconfig_dict=None, dtype=torch.qint8, mapping=DEFAULT_DYNAMIC_MODULE_MAPPING, inplace=False): +def quantize_dynamic(model, qconfig=None, dtype=torch.qint8, mapping=DEFAULT_DYNAMIC_MODULE_MAPPING, inplace=False): r"""Converts a float model to dynamic (i.e. weights-only) quantized model. Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. @@ -278,28 +279,31 @@ def quantize_dynamic(model, qconfig_dict=None, dtype=torch.qint8, mapping=DEFAUL For simplest usage provide `dtype` argument that can be float16 or qint8. Weight-only quantization by default is performed for layers with large weights size - i.e. Linear and RNN variants. - Fine grained control is possible with `qconfig_dict` and `mapping` that act similarly to `quantize()`. - If `qconfig_dict` is provided, the `dtype` argument is ignored. + Fine grained control is possible with `qconfig` and `mapping` that act similarly to `quantize()`. + If `qconfig` is provided, the `dtype` argument is ignored. Args: module: input model - qconfig_dict: dictionary that maps from name or type of submodule to quantization - configuration, qconfig applies to all submodules of a given - module unless qconfig for the submodules are specified (when the - submodule already has qconfig attribute). Entries in the dictionary - need to be QConfigDynamic instances. + qconfig: Either: + * A dictionary that maps from name or type of submodule to quantization + configuration, qconfig applies to all submodules of a given + module unless qconfig for the submodules are specified (when the + submodule already has qconfig attribute). Entries in the dictionary + need to be QConfigDynamic instances. + * A set of types and/or submodule names to apply dynamic quantization to, + in which case the `dtype` argument is used to specifiy the bit-width inplace: carry out model transformations in-place, the original module is mutated mapping: maps type of a submodule to a type of corresponding dynamically quantized version with which the submodule needs to be replaced """ - if qconfig_dict is None: + if qconfig is None: if dtype == torch.qint8: - qconfig_dict = { + qconfig = { nn.Linear : default_dynamic_qconfig, nn.LSTM : default_dynamic_qconfig, } elif dtype == torch.float16: - qconfig_dict = { + qconfig = { # TODO: uncomment when float16 Linear support is added # nn.Linear : default_dynamic_qconfig, nn.LSTM : float16_dynamic_qconfig, @@ -307,11 +311,19 @@ def quantize_dynamic(model, qconfig_dict=None, dtype=torch.qint8, mapping=DEFAUL else: raise ValueError( "Don't know how to quantize with default settings for {}. 
Provide full qconfig please".format(dtype)) + elif isinstance(qconfig, set): + if dtype is torch.qint8: + default_qconfig = default_dynamic_qconfig + elif dtype is torch.float16: + default_qconfig = float16_dynamic_qconfig + else: + raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) + qconfig = dict(zip(qconfig, itertools.repeat(default_qconfig))) if not inplace: model = copy.deepcopy(model) model.eval() - propagate_qconfig_(model, qconfig_dict) + propagate_qconfig_(model, qconfig) convert(model, mapping, inplace=True) return model From a423817055435876631e4c9ce5de11121a98cd98 Mon Sep 17 00:00:00 2001 From: James Reed Date: Wed, 2 Oct 2019 19:53:40 -0700 Subject: [PATCH 006/113] Fix reprs for _intrinsic modules Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27184 Test Plan: Imported from OSS Differential Revision: D17717481 Pulled By: jamesr66a fbshipit-source-id: 4bd72bcd42191d9b21d03f5bb6698198dbffffda --- test/test_quantized_nn_mods.py | 7 +++++++ .../nn/intrinsic/quantized/modules/conv_relu.py | 3 +++ .../intrinsic/quantized/modules/linear_relu.py | 3 +++ torch/quantization/quantize.py | 16 ++++++++-------- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/test/test_quantized_nn_mods.py b/test/test_quantized_nn_mods.py index afa3f423a6421..37756a597a834 100644 --- a/test/test_quantized_nn_mods.py +++ b/test/test_quantized_nn_mods.py @@ -272,8 +272,12 @@ def test_linear_api(self, batch_size, in_features, out_features, use_bias, use_f # ops directly if use_fused: Z_ref = torch.ops.quantized.linear_relu(X_q, W_pack, scale, zero_point) + + self.assertTrue('QuantizedLinearReLU' in str(qlinear)) else: Z_ref = torch.ops.quantized.linear(X_q, W_pack, scale, zero_point) + + self.assertTrue('QuantizedLinear' in str(qlinear)) self.assertEqual(Z_ref, Z_q) # Test serialization of quantized Linear Module using state_dict @@ -483,6 +487,8 @@ def test_conv_api(self, use_bias, use_fused, per_channel, qengine): groups=g, bias=use_bias, padding_mode='zeros') + + self.assertTrue('QuantizedConvReLU2d' in str(loaded_conv_under_test)) else: loaded_conv_under_test = Conv2d(in_channels=iC, out_channels=oC, @@ -493,6 +499,7 @@ def test_conv_api(self, use_bias, use_fused, per_channel, qengine): groups=g, bias=use_bias, padding_mode='zeros') + self.assertTrue('QuantizedConv2d' in str(loaded_conv_under_test)) loaded_conv_under_test.load_state_dict(loaded_dict) self.assertEqual(loaded_conv_under_test._weight_bias(), conv_under_test._weight_bias()) if use_bias: diff --git a/torch/nn/intrinsic/quantized/modules/conv_relu.py b/torch/nn/intrinsic/quantized/modules/conv_relu.py index 1a5d70d8a6dbb..4143a22613c9d 100644 --- a/torch/nn/intrinsic/quantized/modules/conv_relu.py +++ b/torch/nn/intrinsic/quantized/modules/conv_relu.py @@ -35,6 +35,9 @@ def forward(self, input): self.dilation, self.groups, self.scale, self.zero_point) + def _get_name(self): + return 'QuantizedConvReLU2d' + @classmethod def from_float(cls, mod): if type(mod) == torch.nn.intrinsic.qat.ConvBnReLU2d: diff --git a/torch/nn/intrinsic/quantized/modules/linear_relu.py b/torch/nn/intrinsic/quantized/modules/linear_relu.py index 5faf8347c3fca..28bb9d8c1a785 100644 --- a/torch/nn/intrinsic/quantized/modules/linear_relu.py +++ b/torch/nn/intrinsic/quantized/modules/linear_relu.py @@ -32,6 +32,9 @@ def forward(self, input): int(self.zero_point)) return Y_q + def _get_name(self): + return 'QuantizedLinearReLU' + @classmethod def from_float(cls, mod): return super(LinearReLU, 
cls).from_float(mod) diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index f38187f6e1975..85403b71a3063 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -271,7 +271,7 @@ def quantize(model, run_fn, run_args, mapping=DEFAULT_MODULE_MAPPING, inplace=Fa convert(model, mapping, inplace=True) return model -def quantize_dynamic(model, qconfig=None, dtype=torch.qint8, mapping=DEFAULT_DYNAMIC_MODULE_MAPPING, inplace=False): +def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, mapping=DEFAULT_DYNAMIC_MODULE_MAPPING, inplace=False): r"""Converts a float model to dynamic (i.e. weights-only) quantized model. Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. @@ -284,7 +284,7 @@ def quantize_dynamic(model, qconfig=None, dtype=torch.qint8, mapping=DEFAULT_DYN Args: module: input model - qconfig: Either: + qconfig_spec: Either: * A dictionary that maps from name or type of submodule to quantization configuration, qconfig applies to all submodules of a given module unless qconfig for the submodules are specified (when the @@ -296,14 +296,14 @@ def quantize_dynamic(model, qconfig=None, dtype=torch.qint8, mapping=DEFAULT_DYN mapping: maps type of a submodule to a type of corresponding dynamically quantized version with which the submodule needs to be replaced """ - if qconfig is None: + if qconfig_spec is None: if dtype == torch.qint8: - qconfig = { + qconfig_spec = { nn.Linear : default_dynamic_qconfig, nn.LSTM : default_dynamic_qconfig, } elif dtype == torch.float16: - qconfig = { + qconfig_spec = { # TODO: uncomment when float16 Linear support is added # nn.Linear : default_dynamic_qconfig, nn.LSTM : float16_dynamic_qconfig, @@ -311,19 +311,19 @@ def quantize_dynamic(model, qconfig=None, dtype=torch.qint8, mapping=DEFAULT_DYN else: raise ValueError( "Don't know how to quantize with default settings for {}. Provide full qconfig please".format(dtype)) - elif isinstance(qconfig, set): + elif isinstance(qconfig_spec, set): if dtype is torch.qint8: default_qconfig = default_dynamic_qconfig elif dtype is torch.float16: default_qconfig = float16_dynamic_qconfig else: raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) - qconfig = dict(zip(qconfig, itertools.repeat(default_qconfig))) + qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) if not inplace: model = copy.deepcopy(model) model.eval() - propagate_qconfig_(model, qconfig) + propagate_qconfig_(model, qconfig_spec) convert(model, mapping, inplace=True) return model From fc249c79245af10eeb564ab0f1d42b025a8e07f4 Mon Sep 17 00:00:00 2001 From: Yanli Zhao Date: Wed, 2 Oct 2019 23:03:58 -0700 Subject: [PATCH 007/113] skip all rpc and dist autograd spawn tests for Date: Thu, 3 Oct 2019 01:05:18 -0700 Subject: [PATCH 008/113] Add send and recv backward functions for builtin operators RPC. (#25527) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/25527 Master GH issue: https://github.com/pytorch/pytorch/issues/23110. This change builds upon https://github.com/pytorch/pytorch/pull/24876 and provides all the autograd hooks needed for a forward pass with distributed rpc for builtin operators. This change does not address distributed rpc for python UDFs and that will be addressed in follow up PRs. Summary of changes: 1. Attach send autograd functions when a request is sent from the client and response is sent from the server. 2. 
Attach receive autograd functions when a request is received on the server and a response is received on the client. 3. Generate a globally unique autograd_message_id for each send/recv autograd function pair to uniquely identify them. ghstack-source-id: 91240466 Test Plan: unit tests. Differential Revision: D17148077 fbshipit-source-id: 192d8a3f552ed7cc939f55dcca332965c9bd3233 --- caffe2/CMakeLists.txt | 11 +- test/cpp/dist_autograd/test_dist_autograd.cpp | 34 +++- test/dist_autograd_test.py | 77 ++++++--- test/dist_utils.py | 36 ++++ test/rpc_test.py | 94 ++++------ tools/build_variables.py | 17 +- torch/CMakeLists.txt | 5 +- torch/csrc/THP.h | 8 +- torch/csrc/byte_order.h | 31 ---- .../context/dist_autograd_container.cpp | 102 ++++++++--- .../context/dist_autograd_container.h | 33 +++- .../context/dist_autograd_context.cpp | 33 +++- .../autograd/context/dist_autograd_context.h | 30 +++- .../autograd/functions/recvrpc_backward.cpp | 26 +++ .../autograd/functions/recvrpc_backward.h | 21 +++ torch/csrc/distributed/autograd/init.cpp | 27 ++- torch/csrc/distributed/autograd/utils.cpp | 41 ++++- torch/csrc/distributed/autograd/utils.h | 28 ++- torch/csrc/distributed/rpc/functions.cpp | 118 ------------- torch/csrc/distributed/rpc/functions.h | 16 -- torch/csrc/distributed/rpc/future_message.cpp | 16 +- torch/csrc/distributed/rpc/future_message.h | 5 +- torch/csrc/distributed/rpc/init.cpp | 1 - torch/csrc/distributed/rpc/message.cpp | 13 +- torch/csrc/distributed/rpc/message.h | 10 +- .../distributed/rpc/process_group_agent.cpp | 18 +- .../distributed/rpc/process_group_agent.h | 3 +- .../csrc/distributed/rpc/python_functions.cpp | 68 ++++++-- torch/csrc/distributed/rpc/python_functions.h | 2 +- .../distributed/rpc/python_rpc_handler.cpp | 19 +- .../csrc/distributed/rpc/python_rpc_handler.h | 9 +- .../csrc/distributed/rpc/python_udf_call.cpp | 37 ++++ torch/csrc/distributed/rpc/python_udf_call.h | 31 ++++ .../csrc/distributed/rpc/python_udf_resp.cpp | 35 ++++ torch/csrc/distributed/rpc/python_udf_resp.h | 31 ++++ .../csrc/distributed/rpc/request_callback.cpp | 35 ++++ torch/csrc/distributed/rpc/request_callback.h | 32 ++++ .../distributed/rpc/request_callback_impl.cpp | 153 ++++++++++++++++ .../distributed/rpc/request_callback_impl.h | 23 +++ torch/csrc/distributed/rpc/rpc_agent.cpp | 23 +-- torch/csrc/distributed/rpc/rpc_agent.h | 27 +-- torch/csrc/distributed/rpc/rpc_command_base.h | 22 +++ .../distributed/rpc/rpc_with_autograd.cpp | 163 ++++++++++++++++++ .../csrc/distributed/rpc/rpc_with_autograd.h | 75 ++++++++ torch/csrc/distributed/rpc/script_call.cpp | 6 +- torch/csrc/distributed/rpc/script_call.h | 7 +- .../distributed/rpc/script_remote_call.cpp | 8 +- .../csrc/distributed/rpc/script_remote_call.h | 4 +- .../rpc/{script_ret.cpp => script_resp.cpp} | 13 +- .../rpc/{script_ret.h => script_resp.h} | 9 +- .../distributed/rpc/script_rref_proto.cpp | 21 ++- .../csrc/distributed/rpc/script_rref_proto.h | 12 +- torch/csrc/distributed/rpc/utils.cpp | 68 ++++++++ torch/csrc/distributed/rpc/utils.h | 21 +++ torch/csrc/generic/StorageMethods.cpp | 32 ++-- torch/csrc/generic/serialization.cpp | 53 ++++-- torch/csrc/{ => utils}/byte_order.cpp | 24 ++- torch/csrc/utils/byte_order.h | 87 ++++++++++ 58 files changed, 1508 insertions(+), 496 deletions(-) create mode 100644 test/dist_utils.py delete mode 100644 torch/csrc/byte_order.h create mode 100644 torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp create mode 100644 torch/csrc/distributed/autograd/functions/recvrpc_backward.h delete 
mode 100644 torch/csrc/distributed/rpc/functions.cpp delete mode 100644 torch/csrc/distributed/rpc/functions.h create mode 100644 torch/csrc/distributed/rpc/python_udf_call.cpp create mode 100644 torch/csrc/distributed/rpc/python_udf_call.h create mode 100644 torch/csrc/distributed/rpc/python_udf_resp.cpp create mode 100644 torch/csrc/distributed/rpc/python_udf_resp.h create mode 100644 torch/csrc/distributed/rpc/request_callback.cpp create mode 100644 torch/csrc/distributed/rpc/request_callback.h create mode 100644 torch/csrc/distributed/rpc/request_callback_impl.cpp create mode 100644 torch/csrc/distributed/rpc/request_callback_impl.h create mode 100644 torch/csrc/distributed/rpc/rpc_command_base.h create mode 100644 torch/csrc/distributed/rpc/rpc_with_autograd.cpp create mode 100644 torch/csrc/distributed/rpc/rpc_with_autograd.h rename torch/csrc/distributed/rpc/{script_ret.cpp => script_resp.cpp} (66%) rename torch/csrc/distributed/rpc/{script_ret.h => script_resp.h} (57%) create mode 100644 torch/csrc/distributed/rpc/utils.cpp create mode 100644 torch/csrc/distributed/rpc/utils.h rename torch/csrc/{ => utils}/byte_order.cpp (91%) create mode 100644 torch/csrc/utils/byte_order.h diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index e16306765f467..1f40502b7146e 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -481,18 +481,27 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) if (NOT INTERN_BUILD_MOBILE) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp + ${TORCH_SRC_DIR}/csrc/distributed/autograd/context/dist_autograd_container.cpp + ${TORCH_SRC_DIR}/csrc/distributed/autograd/context/dist_autograd_context.cpp + ${TORCH_SRC_DIR}/csrc/distributed/autograd/functions/recvrpc_backward.cpp ${TORCH_SRC_DIR}/csrc/distributed/autograd/functions/sendrpc_backward.cpp ${TORCH_SRC_DIR}/csrc/distributed/autograd/utils.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/future_message.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/message.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_udf_call.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_udf_resp.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/request_callback.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/rpc_with_autograd.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_call.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_remote_call.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_rref_proto.cpp - ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_ret.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_resp.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/utils.cpp ${TORCH_SRC_DIR}/csrc/jit/export.cpp ${TORCH_SRC_DIR}/csrc/jit/import_legacy.cpp ${TORCH_SRC_DIR}/csrc/jit/netdef_converter.cpp ${TORCH_SRC_DIR}/csrc/jit/fuser/cpu/fused_kernel.cpp + ${TORCH_SRC_DIR}/csrc/utils/byte_order.cpp ) endif() diff --git a/test/cpp/dist_autograd/test_dist_autograd.cpp b/test/cpp/dist_autograd/test_dist_autograd.cpp index 91bdbad3456f9..7af16e6a34628 100644 --- a/test/cpp/dist_autograd/test_dist_autograd.cpp +++ b/test/cpp/dist_autograd/test_dist_autograd.cpp @@ -1,10 +1,26 @@ #include #include +#include +#include #include +#include #include -TEST(DistAutogradTest, TestSendFunction) { +using namespace torch::distributed::autograd; +using namespace torch::distributed::rpc; + +class DistAutogradTest : public ::testing::Test { + protected: + static void SetUpTestCase() { + autogradContainer_ = &DistAutogradContainer::init(0); + } + static DistAutogradContainer* autogradContainer_; +}; + +DistAutogradContainer* 
DistAutogradTest::autogradContainer_ = nullptr; + +TEST_F(DistAutogradTest, TestSendFunction) { // Initialize input tensors requiring grad. auto options = at::TensorOptions().requires_grad(true); auto in1 = torch::ones({3, 3}, options); @@ -12,9 +28,12 @@ TEST(DistAutogradTest, TestSendFunction) { ASSERT_FALSE(in1.grad().defined()); ASSERT_FALSE(in2.grad().defined()); + autogradContainer_->newContext(); + DistAutogradContext& autogradContext = autogradContainer_->currentContext(); // Attach the send autograd function to tensors. - auto send_function = - torch::distributed::autograd::addSendRpcBackward({in1, in2}); + std::vector tensors = {in1, in2}; + addSendRpcBackward(autogradContext, AutogradMetadata(1, 1), tensors); + auto send_function = autogradContext.sendFunctions()[1]; ASSERT_NE(send_function, nullptr); // Build loss and attach it as input to send autograd function. @@ -33,14 +52,17 @@ TEST(DistAutogradTest, TestSendFunction) { ASSERT_TRUE(in2.grad().defined()); } -TEST(DistAutogradTest, TestSendFunctionInvalidInputs) { +TEST_F(DistAutogradTest, TestSendFunctionInvalidInputs) { auto options = at::TensorOptions().requires_grad(true); auto in1 = torch::ones({3, 3}, options); auto in2 = torch::ones({3, 3}, options); + autogradContainer_->newContext(); + DistAutogradContext& autogradContext = autogradContainer_->currentContext(); // Attach the send autograd function to tensors. - auto send_function = - torch::distributed::autograd::addSendRpcBackward({in1, in2}); + std::vector tensors = {in1, in2}; + addSendRpcBackward(autogradContext, AutogradMetadata(1, 1), tensors); + auto send_function = autogradContext.sendFunctions()[1]; // Build loss and attach it as input to send autograd function. auto loss = torch::autograd::Variable(torch::ones({3, 3})); diff --git a/test/dist_autograd_test.py b/test/dist_autograd_test.py index 6305cd526b841..e89a28f4c51c4 100644 --- a/test/dist_autograd_test.py +++ b/test/dist_autograd_test.py @@ -1,35 +1,20 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import sys import torch.distributed as dist import torch.distributed.autograd as dist_autograd -from functools import wraps +from dist_utils import dist_init import six import unittest import torch +import time -if not dist.is_available(): - print("c10d not available, skipping tests") - sys.exit(0) - -def dist_init(func): - """ - We use this decorator for setting up and tearing down state since - MultiProcessTestCase runs each `test*` method in a separate process and - each process just runs the `test*` method without actually calling - 'setUp' and 'tearDown' methods of unittest. - """ - @wraps(func) - def wrapper(self): - self.worker_id = self.rank - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group(backend='gloo', rank=self.rank, - world_size=self.world_size, store=store) - dist.init_model_parallel('worker%d' % self.rank) - func(self) - dist.join_rpc() - - return wrapper +prev_rank_rpc_done = False +prev_rank_context_id = 0 +def _set_rpc_done(context_id): + global prev_rank_rpc_done + global prev_rank_context_id + prev_rank_rpc_done = True + prev_rank_context_id = context_id @unittest.skipIf(not six.PY3, "Pytorch distributed autograd package " "does not support python2") @@ -41,6 +26,10 @@ def world_size(self): @dist_init def test_autograd_context(self): + # Verify max possible id. 
+ max_auto_increment = 281474976710655 + self.assertEqual(max_auto_increment + (self.worker_id << 48), dist_autograd._get_max_id()) + context_ids = [] for i in range(1000): with dist_autograd.context() as context_id: @@ -54,12 +43,13 @@ def test_autograd_context(self): dist_autograd._retrieve_context(context_id) @dist_init - def test_autograd_send_function(self): + def test_autograd_functions(self): dst_rank = (self.rank + 1) % self.world_size with dist_autograd.context() as context_id: t1 = torch.ones(3, 3, requires_grad=True) t2 = torch.zeros(3, 3, requires_grad=True) ret = dist.rpc_sync('worker{}'.format(dst_rank), torch.add, args=(t1, t2)) + dist.rpc_sync('worker{}'.format(dst_rank), _set_rpc_done, args=(context_id,)) # Get send function. ctx = dist_autograd._current_context() @@ -68,7 +58,7 @@ def test_autograd_send_function(self): self.assertEqual(1, len(send_functions)) # Retrieve the next functions in the graph. - next_funcs = send_functions[0].next_functions + next_funcs = list(send_functions.values())[0].next_functions self.assertEqual(2, len(next_funcs)) # We should now hit t1 and t2 in the autograd graph. @@ -79,6 +69,39 @@ def test_autograd_send_function(self): self.assertEqual(t2, next_funcs[1][0].variable) self.assertEqual(0, next_funcs[1][1]) + # Test recv functions. + recv_functions = ctx._recv_functions() + self.assertEqual(1, len(recv_functions)) + self.assertEqual(ret.grad_fn, list(recv_functions.values())[0]) + + # We should have send/recv functions from the previous rank, get all + # contexts in this node to find them. + + # Wait for the prev rank to be done with rpc. + while not prev_rank_rpc_done: + time.sleep(0.1) + pass + + # Now verify the autograd graph. + ctx = dist_autograd._retrieve_context(prev_rank_context_id) + + # Get the send function. + send_functions = ctx._send_functions() + self.assertEqual(1, len(send_functions)) + + # Verify next function is AddBackward0 + next_funcs = list(send_functions.values())[0].next_functions + self.assertEqual(1, len(next_funcs)) + add_backward_fn = next_funcs[0][0] + self.assertEqual('AddBackward0', add_backward_fn.name()) + + # Verify the next two functions are the same recv backward function. + next_funcs = add_backward_fn.next_functions + self.assertEqual(2, len(next_funcs)) + self.assertEqual('torch::distributed::autograd::RecvRpcBackward', next_funcs[0][0].name()) + self.assertEqual('torch::distributed::autograd::RecvRpcBackward', next_funcs[1][0].name()) + self.assertEqual(next_funcs[0][0], next_funcs[1][0]) + # autograd context should be cleaned up by now. with self.assertRaises(RuntimeError): ctx = dist_autograd._retrieve_context(context_id) @@ -99,7 +122,7 @@ def test_rpc_complex_args(self): self.assertEqual(torch.stack(tensors), ret) # Verify appropriate tensors have been attached the autograd graph. 
- next_funcs = dist_autograd._current_context()._send_functions()[0].next_functions + next_funcs = list(dist_autograd._current_context()._send_functions().values())[0].next_functions idx = 0 for i in range(num_tensors): if i % 2 == 0: diff --git a/test/dist_utils.py b/test/dist_utils.py new file mode 100644 index 0000000000000..c37c014f365fa --- /dev/null +++ b/test/dist_utils.py @@ -0,0 +1,37 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import sys +from os import getenv +from functools import wraps +import torch.distributed as dist +from torch.distributed.rpc_api import RpcBackend + +if not dist.is_available(): + print("c10d not available, skipping tests") + sys.exit(0) + + +BACKEND = getenv('RPC_BACKEND', RpcBackend.PROCESS_GROUP) +RPC_INIT_URL = getenv('RPC_INIT_URL', '') + +def dist_init(func): + """ + We use this decorator for setting up and tearing down state since + MultiProcessTestCase runs each `test*` method in a separate process and + each process just runs the `test*` method without actually calling + 'setUp' and 'tearDown' methods of unittest. + """ + @wraps(func) + def wrapper(self): + self.worker_id = self.rank + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group(backend='gloo', rank=self.rank, + world_size=self.world_size, store=store) + dist.init_model_parallel(self_name='worker%d' % self.rank, + backend=BACKEND, + self_rank=self.rank, + init_method=RPC_INIT_URL) + func(self) + dist.join_rpc() + + return wrapper diff --git a/test/rpc_test.py b/test/rpc_test.py index ba02aff28616e..2b6f6414c4344 100644 --- a/test/rpc_test.py +++ b/test/rpc_test.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 from __future__ import absolute_import, division, print_function, unicode_literals -import functools import sys import unittest from os import getenv @@ -13,14 +12,9 @@ from collections import namedtuple from torch.distributed.internal_rpc_utils import _internal_rpc_pickler, PythonUDF - -if not dist.is_available(): - print("c10d not available, skipping tests") - sys.exit(0) - from common_utils import load_tests from torch.distributed.rpc_api import RpcBackend - +from dist_utils import dist_init BACKEND = getenv("RPC_BACKEND", RpcBackend.PROCESS_GROUP) RPC_INIT_URL = getenv("RPC_INIT_URL", "") @@ -148,32 +142,6 @@ def raise_func(): load_tests = load_tests -def _wrap_with_rpc(test_method): - """ - We use this decorator for setting up and tearing down state since - MultiProcessTestCase runs each `test*` method in a separate process and - each process just runs the `test*` method without actually calling - 'setUp' and 'tearDown' methods of unittest.
- """ - - @functools.wraps(test_method) - def wrapper(self, *arg, **kwargs): - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group( - backend="gloo", rank=self.rank, world_size=self.world_size, store=store - ) - dist.init_model_parallel( - self_name="worker%d" % self.rank, - backend=BACKEND, - self_rank=self.rank, - init_method=RPC_INIT_URL, - ) - test_method(self, *arg, **kwargs) - dist.join_rpc() - - return wrapper - - @unittest.skipIf( sys.version_info < (3, 0), "Pytorch distributed rpc package " "does not support python2", @@ -183,7 +151,7 @@ class RpcTest(object): def world_size(self): return 4 - @_wrap_with_rpc + @dist_init def test_worker_id(self): n = self.rank + 1 peer_rank = n % self.world_size @@ -196,7 +164,7 @@ def test_worker_id(self): with self.assertRaisesRegex(RuntimeError, "Unknown destination worker"): unknown_worker_id = dist.get_worker_id("WorkerUnknown") - @_wrap_with_rpc + @dist_init def test_self_add(self): self_worker_id = dist.get_worker_id() self_worker_name = "worker{}".format(self.rank) @@ -296,7 +264,7 @@ def test_invalid_names(self): ) dist.join_rpc() - @_wrap_with_rpc + @dist_init def test_add(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -307,7 +275,7 @@ def test_add(self): ) self.assertEqual(ret, torch.ones(n, n) * 2) - @_wrap_with_rpc + @dist_init def test_add_with_id(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -318,7 +286,7 @@ def test_add_with_id(self): ) self.assertEqual(ret, torch.ones(n, n) * 2) - @_wrap_with_rpc + @dist_init def test_scalar_add(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -327,7 +295,7 @@ def test_scalar_add(self): ) self.assertEqual(ret, (torch.ones(n, n) + n)) - @_wrap_with_rpc + @dist_init def test_async_add(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -338,7 +306,7 @@ def test_async_add(self): ) self.assertEqual(fut.wait(), torch.ones(n, n) * 2) - @_wrap_with_rpc + @dist_init def test_nonzero(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -347,7 +315,7 @@ def test_nonzero(self): ret = dist.rpc_sync("worker{}".format(dst_rank), torch.nonzero, args=(x,)) self.assertEqual(ret, x.nonzero()) - @_wrap_with_rpc + @dist_init def test_multi_rpc(self): dst_rank = (self.rank + 1) % self.world_size for i in range(20): @@ -359,7 +327,7 @@ def test_multi_rpc(self): ) self.assertEqual(ret, torch.ones(n, n) * 2) - @_wrap_with_rpc + @dist_init def test_sync_rpc(self): dst_rank = (self.rank + 1) % self.world_size for i in range(20): @@ -378,7 +346,7 @@ def test_sync_rpc(self): self.assertEqual(ret1, torch.ones(n, n) * 2) self.assertEqual(ret2, torch.ones(n, n) * 3) - @_wrap_with_rpc + @dist_init def test_join_rpc(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -400,14 +368,14 @@ def test_join_rpc(self): # it's safe to call join_rpc() multiple times dist.join_rpc() - @_wrap_with_rpc + @dist_init def test_py_built_in(self): n = self.rank + 1 dst_rank = n % self.world_size ret = dist.rpc_sync("worker{}".format(dst_rank), min, args=(n, n + 1, n + 2)) self.assertEqual(ret, min(n, n + 1, n + 2)) - @_wrap_with_rpc + @dist_init def test_py_user_defined(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -418,14 +386,14 @@ def test_py_user_defined(self): ) self.assertEqual(ret, my_function(n, n + 1, n + 2)) - @_wrap_with_rpc + @dist_init def test_py_class_constructor(self): n = self.rank + 1 dst_rank = n % self.world_size ret = dist.rpc_sync("worker{}".format(dst_rank), MyClass, args=(n,)) self.assertEqual(ret.a, n) - @_wrap_with_rpc + 
@dist_init def test_py_class_instance_method(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -434,7 +402,7 @@ def test_py_class_instance_method(self): ) self.assertEqual(ret, MyClass(2).my_instance_method(n)) - @_wrap_with_rpc + @dist_init def test_py_class_method(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -443,7 +411,7 @@ def test_py_class_method(self): ) self.assertEqual(ret, MyClass.my_class_method(n, n + 1)) - @_wrap_with_rpc + @dist_init def test_py_class_static_method(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -452,7 +420,7 @@ def test_py_class_static_method(self): ) self.assertEqual(ret, MyClass.my_static_method(n + 10)) - @_wrap_with_rpc + @dist_init def test_py_multi_async_call(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -462,14 +430,14 @@ def test_py_multi_async_call(self): self.assertEqual(fut1.wait(), MyClass.my_static_method(n + 10)) self.assertEqual(fut2.wait(), min(n, n + 1, n + 2)) - @_wrap_with_rpc + @dist_init def test_py_no_return_result(self): n = self.rank + 1 dst_rank = n % self.world_size ret = dist.rpc_sync("worker{}".format(dst_rank), no_result) self.assertEqual(ret, no_result()) - @_wrap_with_rpc + @dist_init def test_py_tensors(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -480,7 +448,7 @@ def test_py_tensors(self): my_tensor_function(torch.ones(n, n), torch.ones(n, n))) - @_wrap_with_rpc + @dist_init def test_py_tensors_multi_async_call(self): futs = [] n = self.rank + 1 @@ -499,7 +467,7 @@ def test_py_tensors_multi_async_call(self): torch.ones(j, j))) j += 1 - @_wrap_with_rpc + @dist_init def test_py_tensors_in_container(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -511,7 +479,7 @@ def test_py_tensors_in_container(self): args=(a, b, c)) self.assertEqual(ret, my_complex_tensor_function(a, b, c)) - @_wrap_with_rpc + @dist_init def test_py_nested_pickle(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -524,14 +492,14 @@ def test_py_nested_pickle(self): m.set(my_tensor_function(torch.ones(2, 2), torch.ones(2, 2))) self.assertEqual(ret, run_nested_pickle(m, torch.ones(2, 2))) - @_wrap_with_rpc + @dist_init def test_py_function_exception(self): n = self.rank + 1 dst_rank = n % self.world_size with self.assertRaisesRegex(Exception, "TypeError"): ret = dist.rpc_sync("worker{}".format(dst_rank), no_result, args=(10,)) - @_wrap_with_rpc + @dist_init def test_py_raise_in_user_func(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -539,7 +507,7 @@ def test_py_raise_in_user_func(self): with self.assertRaisesRegex(Exception, "ValueError"): fut.wait() - @_wrap_with_rpc + @dist_init def test_nested_rpc(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -570,15 +538,15 @@ def _stress_test_rpc(self, f, repeat=1000, args=()): ) ) - @_wrap_with_rpc + @dist_init def test_stress_light_rpc(self): self._stress_test_rpc(light_rpc) - @_wrap_with_rpc + @dist_init def test_stress_heavy_rpc(self): self._stress_test_rpc(heavy_rpc, repeat=20, args=(torch.ones(100, 100),)) - @_wrap_with_rpc + @dist_init def test_builtin_remote_ret(self): n = self.rank + 1 dst_rank = n % self.world_size @@ -589,7 +557,7 @@ def test_builtin_remote_ret(self): ) self.assertEqual(rref.to_here(), torch.ones(n, n) * 2) - @_wrap_with_rpc + @dist_init def test_multi_builtin_remote_ret(self): m = 10 n = self.rank + 1 @@ -610,7 +578,7 @@ def test_multi_builtin_remote_ret(self): for i in range(m): self.assertEqual(rrefs[i].to_here(), expected[i]) - @_wrap_with_rpc + @dist_init def 
test_multi_layer_nested_async_rpc(self): # This test will exit right away, but there will be a chain of async # RPCs. The termination algorithm should detect those messages properly. diff --git a/tools/build_variables.py b/tools/build_variables.py index 88ffafe3a8e12..b68094e554f1c 100644 --- a/tools/build_variables.py +++ b/tools/build_variables.py @@ -52,13 +52,19 @@ "torch/csrc/distributed/autograd/utils.cpp", "torch/csrc/distributed/autograd/context/dist_autograd_container.cpp", "torch/csrc/distributed/autograd/context/dist_autograd_context.cpp", + "torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp", "torch/csrc/distributed/autograd/functions/sendrpc_backward.cpp", "torch/csrc/distributed/rpc/future_message.cpp", "torch/csrc/distributed/rpc/message.cpp", + "torch/csrc/distributed/rpc/python_udf_call.cpp", + "torch/csrc/distributed/rpc/python_udf_resp.cpp", + "torch/csrc/distributed/rpc/request_callback.cpp", + "torch/csrc/distributed/rpc/rpc_with_autograd.cpp", "torch/csrc/distributed/rpc/script_call.cpp", "torch/csrc/distributed/rpc/script_remote_call.cpp", "torch/csrc/distributed/rpc/script_rref_proto.cpp", - "torch/csrc/distributed/rpc/script_ret.cpp", + "torch/csrc/distributed/rpc/script_resp.cpp", + "torch/csrc/distributed/rpc/utils.cpp", "torch/csrc/Exceptions.cpp", "torch/csrc/jit/autodiff.cpp", "torch/csrc/jit/attributes.cpp", @@ -146,8 +152,6 @@ "torch/csrc/jit/script/builtin_functions.cpp", "torch/csrc/jit/script/module.cpp", "torch/csrc/jit/tracer.cpp", - "torch/csrc/utils/tensor_flatten.cpp", - "torch/csrc/utils/variadic.cpp", "torch/csrc/jit/fuser/kernel_cache.cpp", "torch/csrc/jit/fuser/compiler.cpp", "torch/csrc/jit/fuser/executor.cpp", @@ -162,6 +166,9 @@ "torch/csrc/jit/mobile/module.cpp", "torch/csrc/jit/mobile/register_mobile_ops.cpp", "torch/csrc/jit/mobile/interpreter.cpp", + "torch/csrc/utils/byte_order.cpp", + "torch/csrc/utils/tensor_flatten.cpp", + "torch/csrc/utils/variadic.cpp", ] libtorch_cuda_sources = [ @@ -256,17 +263,15 @@ def add_torch_libs(): "torch/csrc/autograd/python_legacy_variable.cpp", "torch/csrc/autograd/python_variable.cpp", "torch/csrc/autograd/python_variable_indexing.cpp", - "torch/csrc/byte_order.cpp", "torch/csrc/distributed/autograd/init.cpp", "torch/csrc/distributed/c10d/comm.cpp", "torch/csrc/distributed/c10d/init.cpp", "torch/csrc/distributed/c10d/reducer.cpp", - "torch/csrc/distributed/autograd/init.cpp", - "torch/csrc/distributed/rpc/functions.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", "torch/csrc/distributed/rpc/python_functions.cpp", "torch/csrc/distributed/rpc/python_rpc_handler.cpp", + "torch/csrc/distributed/rpc/request_callback_impl.cpp", "torch/csrc/distributed/rpc/rpc_agent.cpp", "torch/csrc/distributed/rpc/rref.cpp", "torch/csrc/distributed/rpc/rref_context.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 59002e7203df6..19eeecd0bb739 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -68,7 +68,6 @@ set(TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/autograd/python_legacy_variable.cpp ${TORCH_SRC_DIR}/csrc/autograd/python_variable.cpp ${TORCH_SRC_DIR}/csrc/autograd/python_variable_indexing.cpp - ${TORCH_SRC_DIR}/csrc/byte_order.cpp ${TORCH_SRC_DIR}/csrc/jit/init.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/onnx.cpp ${TORCH_SRC_DIR}/csrc/jit/passes/onnx/fixup_onnx_loop.cpp @@ -225,16 +224,14 @@ if (USE_DISTRIBUTED) if (NOT MSVC) list(APPEND TORCH_PYTHON_SRCS ${TORCH_SRC_DIR}/csrc/distributed/autograd/init.cpp - 
${TORCH_SRC_DIR}/csrc/distributed/autograd/context/dist_autograd_container.cpp - ${TORCH_SRC_DIR}/csrc/distributed/autograd/context/dist_autograd_context.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/comm.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/init.cpp ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp - ${TORCH_SRC_DIR}/csrc/distributed/rpc/functions.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_functions.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_rpc_handler.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/request_callback_impl.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rpc_agent.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rref_context.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rref.cpp diff --git a/torch/csrc/THP.h b/torch/csrc/THP.h index e8ace1a9d82eb..aee590f1ff081 100644 --- a/torch/csrc/THP.h +++ b/torch/csrc/THP.h @@ -28,15 +28,15 @@ #define THWTensor THTensor #define THWTensor_(NAME) THTensor_(NAME) -#include #include #include -#include -#include #include +#include +#include +#include #include #include // This requires defined Storage and Tensor types -#include +#include #ifdef _THP_CORE #include diff --git a/torch/csrc/byte_order.h b/torch/csrc/byte_order.h deleted file mode 100644 index b8b0f7a22c9a7..0000000000000 --- a/torch/csrc/byte_order.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef THP_BYTE_ORDER_H -#define THP_BYTE_ORDER_H - -#include -#include -#include -#include - -enum THPByteOrder { - THP_LITTLE_ENDIAN = 0, - THP_BIG_ENDIAN = 1 -}; - -THPByteOrder THP_nativeByteOrder(); - -void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeFloatBuffer(float* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeDoubleBuffer(double* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeBoolBuffer(bool* dst, const uint8_t* src, THPByteOrder order, size_t len); -void THP_decodeBFloat16Buffer(at::BFloat16* dst, const uint8_t* src, THPByteOrder order, size_t len); - -void THP_encodeInt16Buffer(uint8_t* dst, const int16_t* src, THPByteOrder order, size_t len); -void THP_encodeInt32Buffer(uint8_t* dst, const int32_t* src, THPByteOrder order, size_t len); -void THP_encodeInt64Buffer(uint8_t* dst, const int64_t* src, THPByteOrder order, size_t len); -void THP_encodeFloatBuffer(uint8_t* dst, const float* src, THPByteOrder order, size_t len); -void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, size_t len); - -#endif diff --git a/torch/csrc/distributed/autograd/context/dist_autograd_container.cpp b/torch/csrc/distributed/autograd/context/dist_autograd_container.cpp index be7759c963ddd..3302e1d7b81f9 100644 --- a/torch/csrc/distributed/autograd/context/dist_autograd_container.cpp +++ b/torch/csrc/distributed/autograd/context/dist_autograd_container.cpp @@ -5,56 +5,98 @@ namespace torch { namespace distributed { namespace autograd { -constexpr int kContextIdBits = 48; -constexpr int64_t kContextIdMask = (1LL << kContextIdBits) - 1; +constexpr int kAutoIncrementBits = 48; +constexpr int64_t kAutoIncrementMask = (1LL << kAutoIncrementBits) - 1; constexpr int kMaxWorkerId = 65535; -constexpr 
int64_t kMaxContextId = kContextIdMask; -thread_local int64_t DistAutogradContainer::current_context_id_ = -1; +// Each thread has a single autograd_context_id valid at any point in time. +static thread_local int64_t current_context_id_ = -1; + +// Lock to ensure DistAutogradContainer is initialized only once. +static std::mutex dist_container_init_lock_; DistAutogradContainer::DistAutogradContainer() - : next_context_id_(0), worker_id_(0), initialized_(false) {} + : next_context_id_(0), + worker_id_(0), + initialized_(false), + next_autograd_message_id_(0), + max_id_(0) {} DistAutogradContainer& DistAutogradContainer::init(int64_t worker_id) { + std::lock_guard guard(dist_container_init_lock_); + TORCH_CHECK( worker_id >= 0 && worker_id <= kMaxWorkerId, "worker_id needs to be in the range [0, 65535]") - auto& container = getInstance(); + auto& container = getInstanceInternal(); + TORCH_CHECK( + !container.initialized_, + "Container is already initialized! Cannot initialize it twice!"); + container.worker_id_ = worker_id; container.next_context_id_ = static_cast(worker_id) - << kContextIdBits; + << kAutoIncrementBits; + container.next_autograd_message_id_ = static_cast(worker_id) + << kAutoIncrementBits; + container.max_id_ = + (kAutoIncrementMask | + (static_cast(worker_id) << kAutoIncrementBits)); container.initialized_ = true; return container; } DistAutogradContainer& DistAutogradContainer::getInstance() { + auto& instance = getInstanceInternal(); + TORCH_CHECK( + instance.initialized_, + "Need to initialize distributed autograd using " + "torch.distributed.autograd.init()"); + return instance; +} + +DistAutogradContainer& DistAutogradContainer::getInstanceInternal() { static DistAutogradContainer container; return container; } -const DistAutogradContext& DistAutogradContainer::newContext() { - if (!initialized_) { - throw std::runtime_error( - "Need to initialize distributed autograd using " - "torch.distributed.autograd.init()"); +int64_t DistAutogradContainer::newAutogradMessageId() { + // Check for overflow into workerId_ section. + TORCH_INTERNAL_ASSERT(next_autograd_message_id_ < max_id_); + return next_autograd_message_id_++; +} + +DistAutogradContext& DistAutogradContainer::getOrCreateContext( + int64_t context_id) { + std::lock_guard guard(autograd_context_lock_); + auto it = autograd_context_.find(context_id); + if (it != autograd_context_.end()) { + return it->second; } + auto& context = autograd_context_ + .emplace( + std::piecewise_construct, + std::forward_as_tuple(context_id), + std::forward_as_tuple(context_id)) + .first->second; + return context; +} + +const DistAutogradContext& DistAutogradContainer::newContext() { std::lock_guard guard(autograd_context_lock_); - TORCH_INTERNAL_ASSERT( - next_context_id_ < std::numeric_limits::max() && - next_context_id_ < - (kMaxContextId | - (static_cast(worker_id_) << kContextIdBits)), - "We have run out of autograd context ids!!!"); - - autograd_context_.emplace( - std::piecewise_construct, - std::forward_as_tuple(next_context_id_), - std::forward_as_tuple(next_context_id_)); - - current_context_id_ = next_context_id_; - return autograd_context_.at(next_context_id_++); + // Check for overflow into workerId_ section. 
+ TORCH_INTERNAL_ASSERT(next_context_id_ < max_id_); + + auto& context = autograd_context_ + .emplace( + std::piecewise_construct, + std::forward_as_tuple(next_context_id_), + std::forward_as_tuple(next_context_id_)) + .first->second; + + current_context_id_ = next_context_id_++; + return context; } bool DistAutogradContainer::hasValidContext() const { @@ -90,8 +132,8 @@ void DistAutogradContainer::releaseContext(int64_t context_id) { } } -const DistAutogradContext& DistAutogradContainer::retrieveContext( - int64_t context_id) const { +DistAutogradContext& DistAutogradContainer::retrieveContext( + int64_t context_id) { std::lock_guard guard(autograd_context_lock_); TORCH_CHECK( autograd_context_.find(context_id) != autograd_context_.end(), @@ -100,6 +142,10 @@ const DistAutogradContext& DistAutogradContainer::retrieveContext( return autograd_context_.at(context_id); } +int64_t DistAutogradContainer::getMaxId() { + return max_id_; +} + } // namespace autograd } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/autograd/context/dist_autograd_container.h b/torch/csrc/distributed/autograd/context/dist_autograd_container.h index 22ddff5592172..3f0aa846b0fab 100644 --- a/torch/csrc/distributed/autograd/context/dist_autograd_container.h +++ b/torch/csrc/distributed/autograd/context/dist_autograd_container.h @@ -18,12 +18,18 @@ namespace autograd { // autograd_context_id. The autograd_context_id itself is a 64 bit globally // unique id. The first 16 bits is the worker_id and the next 48 bits is an // auto-incrementing id for each worker. -class DistAutogradContainer { +// +// This container is also responsible for maintaining a globally unique message +// id, which is used to associate send/recv autograd function pairs. The format +// is similar to the autograd_context_id where we have a 64 bit integer with +// first 16 bits being the worker id and next 48 bits are auto-incrementing. +class TORCH_API DistAutogradContainer { public: // One time initialization of the container. static DistAutogradContainer& init(int64_t worker_id); - // Retrieve the singleton instance of the container. + // Retrieve the singleton instance of the container, ensures we have + // initialized the container. static DistAutogradContainer& getInstance(); // Create a new context for a distributed autograd pass. @@ -33,7 +39,7 @@ class DistAutogradContainer { void releaseContext(int64_t context_id); // Retrieve the autograd context for a given context_id. - const DistAutogradContext& retrieveContext(int64_t context_id) const; + DistAutogradContext& retrieveContext(int64_t context_id); // Retrieves the currently active autograd context for the current thread. DistAutogradContext& currentContext(); @@ -41,6 +47,18 @@ class DistAutogradContainer { // Checks whether or not the current thread has a valid autograd context. bool hasValidContext() const; + // Generate a new autograd_message_id for send/recv autograd functions. + int64_t newAutogradMessageId(); + + // Creates a new autograd context with the provided context_id. If a context + // already exists with the provided context_id, we just return it. + // This does not set the current context for the current thread. + DistAutogradContext& getOrCreateContext(int64_t context_id); + + // Retrieves the maximum possible autograd_context_id/autograd_message_id that + // can be generated by this worker. 
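To make the 16/48-bit id layout above concrete, here is a small standalone sketch (not part of this patch; the worker id of 3 is made up, and the constants mirror kAutoIncrementBits/kAutoIncrementMask from dist_autograd_container.cpp):

#include <cstdint>

constexpr int kAutoIncrementBits = 48;
constexpr int64_t kAutoIncrementMask = (1LL << kAutoIncrementBits) - 1;
constexpr int64_t worker_id = 3;  // hypothetical worker
constexpr int64_t first_id = worker_id << kAutoIncrementBits;  // top 16 bits: worker id
constexpr int64_t max_id = first_id | kAutoIncrementMask;      // low 48 bits all set
static_assert(max_id - first_id == 281474976710655,
              "2^48 - 1, the constant checked in test_autograd_context");

Each worker can therefore hand out 2^48 ids before tripping the overflow asserts in newContext() and newAutogradMessageId().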
+ int64_t getMaxId(); + private: DistAutogradContainer(); ~DistAutogradContainer() = default; @@ -50,6 +68,8 @@ class DistAutogradContainer { DistAutogradContainer(DistAutogradContainer&&) = delete; DistAutogradContainer& operator=(DistAutogradContainer&&) = delete; + static DistAutogradContainer& getInstanceInternal(); + // Auto incrementing context id used to identify unique autograd passes. // Initialized with the first 16 bits being the worker_id. int64_t next_context_id_; @@ -67,8 +87,11 @@ class DistAutogradContainer { // and worker_id_ are immutable. mutable std::mutex autograd_context_lock_; - // Each thread has a single autograd_context_id valid at any point in time. - static thread_local int64_t current_context_id_; + // Autograd message id to identify unique send/recv autograd function pairs. + std::atomic next_autograd_message_id_; + + // Maximum allowed value for autograd_context_id or autograd_message_id. + int64_t max_id_; }; } // namespace autograd diff --git a/torch/csrc/distributed/autograd/context/dist_autograd_context.cpp b/torch/csrc/distributed/autograd/context/dist_autograd_context.cpp index 8ad60c7b59ed3..0323ad849aa2e 100644 --- a/torch/csrc/distributed/autograd/context/dist_autograd_context.cpp +++ b/torch/csrc/distributed/autograd/context/dist_autograd_context.cpp @@ -13,16 +13,41 @@ int64_t DistAutogradContext::context_id() const { } void DistAutogradContext::addSendFunction( - const std::shared_ptr& func) { + const std::shared_ptr& func, + int64_t autograd_message_id) { + TORCH_INTERNAL_ASSERT(func != nullptr); + + std::lock_guard guard(lock_); + TORCH_INTERNAL_ASSERT( + sendAutogradFunctions_.find(autograd_message_id) == + sendAutogradFunctions_.end()); + sendAutogradFunctions_.emplace(autograd_message_id, func); +} + +void DistAutogradContext::addRecvFunction( + std::shared_ptr& func, + int64_t autograd_message_id) { + TORCH_INTERNAL_ASSERT(func != nullptr); + std::lock_guard guard(lock_); - sendAutogradFunctions_.push_back(func); + TORCH_INTERNAL_ASSERT( + recvAutogradFunctions_.find(autograd_message_id) == + recvAutogradFunctions_.end()); + recvAutogradFunctions_.emplace(autograd_message_id, func); } -std::vector> DistAutogradContext:: - sendFunctions() const { +std::unordered_map> +DistAutogradContext::sendFunctions() const { + std::lock_guard guard(lock_); return sendAutogradFunctions_; } +std::unordered_map> +DistAutogradContext::recvFunctions() const { + std::lock_guard guard(lock_); + return recvAutogradFunctions_; +} + } // namespace autograd } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/autograd/context/dist_autograd_context.h b/torch/csrc/distributed/autograd/context/dist_autograd_context.h index c4eec7e96d38c..1ebd91e9f57ef 100644 --- a/torch/csrc/distributed/autograd/context/dist_autograd_context.h +++ b/torch/csrc/distributed/autograd/context/dist_autograd_context.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -9,17 +10,30 @@ namespace autograd { // DistAutogradContext which stores information for a single distributed // autograd pass on a worker. -class DistAutogradContext { +class TORCH_API DistAutogradContext { public: explicit DistAutogradContext(int64_t context_id); // Retrieves the autograd context id for this context. int64_t context_id() const; - // Records a 'send' autograd function for this context. - void addSendFunction(const std::shared_ptr& func); + // Records a 'send' autograd function for this context with the provided + // message id. 
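As a rough usage sketch of the two maps this class maintains (the flow is pieced together from the C++ test and utils.cpp in this patch; SendRpcBackward being default-constructible is inferred from its use there):

// Sketch only: record a 'send' function under a fresh autograd message id.
auto& container = DistAutogradContainer::getInstance();
container.newContext();  // makes a context current for this thread
DistAutogradContext& ctx = container.currentContext();
int64_t msgId = container.newAutogradMessageId();
auto sendFn = std::make_shared<SendRpcBackward>();
ctx.addSendFunction(sendFn, msgId);
auto sendFns = ctx.sendFunctions();  // snapshot of the id -> function map, copied under the lock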
+ void addSendFunction( + const std::shared_ptr& func, + int64_t autograd_message_id); - std::vector> sendFunctions() const; + // Records a 'recv' autograd function for this context with the provided + // message id. + void addRecvFunction( + std::shared_ptr& func, + int64_t autograd_message_id); + + std::unordered_map> sendFunctions() + const; + + std::unordered_map> recvFunctions() + const; DistAutogradContext(const DistAutogradContext&) = delete; DistAutogradContext& operator=(const DistAutogradContext&) = delete; @@ -29,7 +43,13 @@ class DistAutogradContext { private: const int64_t context_id_; - std::vector> sendAutogradFunctions_; + // Map from autograd_message_id to appropriate 'send' autograd function. + std::unordered_map> + sendAutogradFunctions_; + + // Map from autograd_message_id to appropriate 'recv' autograd function. + std::unordered_map> + recvAutogradFunctions_; // Lock to protect concurrent modification of the context. mutable std::mutex lock_; diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp new file mode 100644 index 0000000000000..aeecbb4a8906e --- /dev/null +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.cpp @@ -0,0 +1,26 @@ +#include +#include + +namespace torch { +namespace distributed { +namespace autograd { + +using torch::autograd::Variable; + +torch::autograd::variable_list RecvRpcBackward::apply( + torch::autograd::variable_list&& grads) { + std::vector outputGrads; + for (const auto& grad : grads) { + if (grad.defined()) { + outputGrads.emplace_back(grad); + } else { + outputGrads.emplace_back(at::zeros_like(grad)); + } + } + + return outputGrads; +} + +} // namespace autograd +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/autograd/functions/recvrpc_backward.h b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h new file mode 100644 index 0000000000000..fbb2e9ff3ee3c --- /dev/null +++ b/torch/csrc/distributed/autograd/functions/recvrpc_backward.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace autograd { + +// As part of our distributed autograd implementation, whenever we receive an +// RPC from a node, we add a 'RecvRpcBackward' autograd function to the +// autograd graph. This is more or less a placeholder function that is used to +// pass gradients to the remote host during the backward pass. The inputs to the +// RPC function are the inputs to this autograd function. 
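A minimal sketch of what this node does when the backward pass reaches it; calling apply() directly is for illustration only (the autograd engine normally invokes it), and the input shape is arbitrary:

auto recvFn = std::make_shared<RecvRpcBackward>();
torch::autograd::variable_list grads = {
    torch::autograd::Variable(torch::ones({3, 3}))};
auto forwarded = recvFn->apply(std::move(grads));  // defined grads pass through unchanged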
+struct TORCH_API RecvRpcBackward : public torch::autograd::Node { + torch::autograd::variable_list apply( + torch::autograd::variable_list&& grads) override; +}; + +} // namespace autograd +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/autograd/init.cpp b/torch/csrc/distributed/autograd/init.cpp index 021c402943119..fa214df06a4b0 100644 --- a/torch/csrc/distributed/autograd/init.cpp +++ b/torch/csrc/distributed/autograd/init.cpp @@ -30,11 +30,26 @@ PyObject* dist_autograd_init(PyObject* /* unused */) { "_context_id", &DistAutogradContext::context_id, py::call_guard()) + .def( + "_recv_functions", + [](const DistAutogradContext& ctx) { + std::map funcs; + for (const auto& map_entry : ctx.recvFunctions()) { + funcs.emplace( + map_entry.first, + py::reinterpret_steal( + torch::autograd::functionToPyObject( + map_entry.second))); + } + return funcs; + }) .def("_send_functions", [](const DistAutogradContext& ctx) { - std::vector funcs; - for (const auto& sendFunction : ctx.sendFunctions()) { - funcs.push_back(py::reinterpret_steal( - torch::autograd::functionToPyObject(sendFunction))); + std::map funcs; + for (const auto& map_entry : ctx.sendFunctions()) { + funcs.emplace( + map_entry.first, + py::reinterpret_steal( + torch::autograd::functionToPyObject(map_entry.second))); } return funcs; }); @@ -50,6 +65,10 @@ PyObject* dist_autograd_init(PyObject* /* unused */) { return DistAutogradContainer::getInstance().releaseContext(context_id); }); + module.def("_get_max_id", []() { + return DistAutogradContainer::getInstance().getMaxId(); + }); + module.def( "_retrieve_context", [](int64_t context_id) -> const DistAutogradContext& { diff --git a/torch/csrc/distributed/autograd/utils.cpp b/torch/csrc/distributed/autograd/utils.cpp index e85bad03f6a3f..3eb19d45d4006 100644 --- a/torch/csrc/distributed/autograd/utils.cpp +++ b/torch/csrc/distributed/autograd/utils.cpp @@ -1,24 +1,55 @@ #include +#include +#include +#include #include namespace torch { namespace distributed { namespace autograd { -std::shared_ptr addSendRpcBackward( - const std::vector& tensors) { +using torch::distributed::rpc::Message; + +void addSendRpcBackward( + DistAutogradContext& autogradContext, + const torch::distributed::rpc::AutogradMetadata& autogradMetadata, + std::vector& tensors) { // Attach the appropriate autograd edges. - std::shared_ptr grad_fn; if (torch::autograd::compute_requires_grad(tensors)) { - grad_fn = std::make_shared(); + auto grad_fn = std::make_shared(); grad_fn->set_next_edges(torch::autograd::collect_next_edges(tensors)); // Add the appropriate input metadata for the grad_fn. for (const auto& tensor : tensors) { grad_fn->add_input_metadata(tensor); } + + // Record the send autograd function in our current context. + autogradContext.addSendFunction( + grad_fn, autogradMetadata.autogradMessageId); + } +} + +DistAutogradContext* addRecvRpcBackward( + const torch::distributed::rpc::AutogradMetadata& autogradMetadata, + std::vector& tensors) { + if (torch::autograd::compute_requires_grad(tensors)) { + // Attach the tensors as inputs to the autograd function. + auto grad_fn = std::make_shared(); + for (auto& tensor : tensors) { + torch::autograd::set_history(tensor, grad_fn); + } + + // Now update the autograd context with the necessary information. + auto& autogradContainer = DistAutogradContainer::getInstance(); + // Initialize autograd context if necessary. 
+ DistAutogradContext& autogradContext = autogradContainer.getOrCreateContext( + autogradMetadata.autogradContextId); + autogradContext.addRecvFunction( + grad_fn, autogradMetadata.autogradMessageId); + return &autogradContext; } - return grad_fn; + return nullptr; } } // namespace autograd diff --git a/torch/csrc/distributed/autograd/utils.h b/torch/csrc/distributed/autograd/utils.h index 6e04605711b86..3126f6392b07f 100644 --- a/torch/csrc/distributed/autograd/utils.h +++ b/torch/csrc/distributed/autograd/utils.h @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include namespace torch { namespace distributed { @@ -9,12 +9,26 @@ namespace autograd { // This method is used to attach the 'send' autograd function to the autograd // graph when we use RPC. This method creates a new 'send' autograd function -// and attaches the provided tensors as next_edges to the 'send' function. +// and attaches the provided tensors as next_edges to the 'send' function. In +// addition to this, it also registers the send function in the provided +// autograd context. Finally, the RPC message is updated with appropriate +// autograd information for the recipient. +TORCH_API void addSendRpcBackward( + DistAutogradContext& autogradContext, + const torch::distributed::rpc::AutogradMetadata& autogradMetadata, + std::vector& tensors); + +// This method is used to attach the 'recv' autograd function to the autograd +// graph when we use RPC. This method creates a new 'recv' autograd function +// and attaches the provided tensors as inputs to the 'recv' function. It +// creates a new autograd context if needed and registers the 'recv' function +// with this context. // -// Returns a shared_ptr to the autograd function, so that we can hold a -// reference to it. -TORCH_API std::shared_ptr addSendRpcBackward( - const std::vector& tensors); +// Returns a pointer to the autograd context created (nullptr in case no +// autograd information was needed).
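A hedged caller-side sketch of the null-pointer contract (metadata and receivedTensors are hypothetical placeholders; the real call site lands in request_callback_impl.cpp later in this patch):

if (DistAutogradContext* ctx = addRecvRpcBackward(metadata, receivedTensors)) {
  // Some tensor required grad; the 'recv' function is now recorded in *ctx.
} else {
  // nullptr: no autograd information was needed for this message.
}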
+TORCH_API DistAutogradContext* addRecvRpcBackward( + const torch::distributed::rpc::AutogradMetadata& autogradMetadata, + std::vector& tensors); } // namespace autograd } // namespace distributed diff --git a/torch/csrc/distributed/rpc/functions.cpp b/torch/csrc/distributed/rpc/functions.cpp deleted file mode 100644 index 5d5d054888759..0000000000000 --- a/torch/csrc/distributed/rpc/functions.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace torch { -namespace distributed { -namespace rpc { - -Message createException(const Message& request, const std::exception& e) { - const char* err = e.what(); - std::vector payload(err, err + strlen(err)); - return Message( - std::move(payload), - std::vector(), - MessageType::EXCEPTION, - request.id()); -} - -Message processRequestBlocking(Message&& request) { - switch (request.type()) { - case MessageType::SCRIPT_CALL: { - try { - ScriptCall sc = ScriptCall::fromMessage(request); - - // sc is only alive within this block, use reference to avoid copy - auto& stack = sc.stackRef(); - sc.op()->getOperation()(stack); - - AT_ASSERT( - stack.size() == 1, - "Return value of a builtin operator or a " - "TorchScript function should be a single IValue, got a vector of " - "size ", - stack.size()); - auto response = ScriptRet(std::move(stack.front())).toMessage(); - - response.setId(request.id()); - return response; - } catch (std::exception& e) { - return createException(request, e); - } - break; - } - case MessageType::PYTHON_CALL: { - try { - std::vector tensorTable; - auto payload = PythonRpcHandler::getInstance().generatePythonUDFResult( - request, tensorTable); - return Message( - std::move(payload), - std::move(tensorTable), - MessageType::PYTHON_RET, - request.id()); - } catch (std::exception& e) { - return createException(request, e); - } - break; - } - case MessageType::REMOTE_CALL: { - ScriptRemoteCall src = ScriptRemoteCall::fromMessage(request); - - auto rrefId = RRefId::fromIValue(src.retRRefId()); - auto forkId = ForkId::fromIValue(src.retForkId()); - TORCH_CHECK(rrefId != forkId, "Does not support remote call to self."); - - auto& ctx = RRefContext::getInstance(); - auto ownerRRef = ctx->getOrCreateOwnerRRef(rrefId); - - // TODO: make this asynchronous - // src is only alive within this block, use reference to avoid copy - auto& stack = src.stackRef(); - src.op()->getOperation()(stack); - AT_ASSERT( - stack.size() == 1, - "Return value of a builtin operator or a " - "TorchScript function should be a single IValue, got a vector of " - "size ", - stack.size()); - - ownerRRef->setValue(std::move(stack.front())); - return Message(); - } - case MessageType::RREF_FETCH_CALL: { - ScriptRRefFetchCall srf = ScriptRRefFetchCall::fromMessage(request); - // TODO: make this asynchronous - std::shared_ptr> rref = - RRefContext::getInstance()->getOrCreateOwnerRRef( - RRefId::fromIValue(srf.value())); - auto response = ScriptRRefFetchRet(rref->getValue()).toMessage(); - response.setId(request.id()); - return response; - } - case MessageType::RREF_USER_CREATE: { - ScriptRRefCreate sra = ScriptRRefCreate::fromMessage(request); - RRefContext::getInstance()->addFork(sra.valueRef()); - return Message(); - } - case MessageType::RREF_USER_DELETE: { - ScriptRRefDelete srd = ScriptRRefDelete::fromMessage(request); - RRefContext::getInstance()->delFork(srd.valueRef()); - return Message(); - } - default: { - AT_ERROR("Request type ", request.type(), " not supported."); - } - } -} - 
-} // namespace rpc -} // namespace distributed -} // namespace torch diff --git a/torch/csrc/distributed/rpc/functions.h b/torch/csrc/distributed/rpc/functions.h deleted file mode 100644 index d5f82885712f7..0000000000000 --- a/torch/csrc/distributed/rpc/functions.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include -#include - -namespace torch { -namespace distributed { -namespace rpc { - -Message processRequestBlocking(Message&& message); - -Message createException(const Message& request, const std::exception& e); - -} // namespace rpc -} // namespace distributed -} // namespace torch diff --git a/torch/csrc/distributed/rpc/future_message.cpp b/torch/csrc/distributed/rpc/future_message.cpp index bd0c725986671..c8030708ac7c9 100644 --- a/torch/csrc/distributed/rpc/future_message.cpp +++ b/torch/csrc/distributed/rpc/future_message.cpp @@ -7,6 +7,7 @@ namespace rpc { const Message& FutureMessage::wait() { std::unique_lock lock(mutex_); finished_cv_.wait(lock, [this] { return completed_.load(); }); + return message_; } @@ -26,13 +27,6 @@ void FutureMessage::markCompleted() { markCompleted(Message()); } -const Message& FutureMessage::message() { - std::unique_lock lock(mutex_); - TORCH_CHECK(completed(), "Cannot retrieve message before completed."); - - return message_; -} - bool FutureMessage::completed() const { return completed_; } @@ -44,17 +38,17 @@ void FutureMessage::addCallback(const FutureMessage::Callback& callback) { callback(message_); return; } - callbacks.push_back(callback); + callbacks_.push_back(callback); } void FutureMessage::fireCallbacks() { TORCH_CHECK(completed(), "Firing callbacks on incomplete FutureMessage."); - // There is no need to protect callbacks with the lock. + // There is no need to protect callbacks_ with the lock. // Once completed_ is set to true, no one can add new callback to the list. - for (auto& callback : callbacks) { + for (auto& callback : callbacks_) { callback(message_); } - callbacks.clear(); + callbacks_.clear(); } } // namespace rpc diff --git a/torch/csrc/distributed/rpc/future_message.h b/torch/csrc/distributed/rpc/future_message.h index 675582ef87bc8..0009f310937d4 100644 --- a/torch/csrc/distributed/rpc/future_message.h +++ b/torch/csrc/distributed/rpc/future_message.h @@ -18,7 +18,6 @@ struct TORCH_API FutureMessage final { const Message& wait(); void markCompleted(Message message); void markCompleted(); - const Message& message(); bool completed() const; // If completed() the callback will be invoked in-place. 
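A short sketch of the two callback paths (assuming Callback is compatible with a const Message& lambda, as the usage in fireCallbacks() suggests):

FutureMessage fut;
fut.addCallback([](const Message& m) { /* queued: runs when markCompleted() fires */ });
fut.markCompleted(Message());  // fires the queued callback
fut.addCallback([](const Message& m) { /* future already complete: runs inline */ });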
@@ -27,10 +26,10 @@ struct TORCH_API FutureMessage final { private: void fireCallbacks(); - std::mutex mutex_; + mutable std::mutex mutex_; std::atomic_bool completed_{false}; // is this future complete std::condition_variable finished_cv_; - std::vector callbacks; + std::vector callbacks_; // TODO: make message_ an optional field, and get rid of UNKNOWN message type Message message_; }; diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index 9d732e55f3bd6..e70e89a3ff653 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -1,6 +1,5 @@ #include -#include #include #include #include diff --git a/torch/csrc/distributed/rpc/message.cpp b/torch/csrc/distributed/rpc/message.cpp index cbfa567c868b6..f518421565053 100644 --- a/torch/csrc/distributed/rpc/message.cpp +++ b/torch/csrc/distributed/rpc/message.cpp @@ -44,10 +44,18 @@ void Message::swap(Message& rhs) noexcept { std::swap(id_, rhs.id_); } +std::vector&& Message::movePayload() && { + return std::move(payload_); +} + const std::vector& Message::payload() const { return payload_; } +std::vector& Message::tensors() { + return tensors_; +} + const std::vector& Message::tensors() const { return tensors_; } @@ -59,6 +67,7 @@ const MessageType& Message::type() const { bool Message::isRequest() const { return MessageType::SCRIPT_CALL == type_ || MessageType::PYTHON_CALL == type_ || MessageType::REMOTE_CALL == type_ || + MessageType::MESSAGE_WITH_AUTOGRAD_REQ == type_ || MessageType::RREF_FETCH_CALL == type_ || MessageType::RREF_USER_CREATE == type_ || MessageType::RREF_USER_DELETE == type_; @@ -67,12 +76,14 @@ bool Message::isRequest() const { bool Message::requiresResponse() const { return MessageType::SCRIPT_CALL == type_ || MessageType::PYTHON_CALL == type_ || + MessageType::MESSAGE_WITH_AUTOGRAD_REQ == type_ || MessageType::RREF_FETCH_CALL == type_; } bool Message::isResponse() const { return MessageType::SCRIPT_RET == type_ || MessageType::PYTHON_RET == type_ || - MessageType::RREF_FETCH_RET == type_; + MessageType::RREF_FETCH_RET == type_ || + MessageType::MESSAGE_WITH_AUTOGRAD_RESP == type_; } bool Message::isShutdown() const { diff --git a/torch/csrc/distributed/rpc/message.h b/torch/csrc/distributed/rpc/message.h index eea216f7354a7..76fbd9f805c6a 100644 --- a/torch/csrc/distributed/rpc/message.h +++ b/torch/csrc/distributed/rpc/message.h @@ -17,6 +17,8 @@ enum MessageType { RREF_FETCH_RET, RREF_USER_CREATE, RREF_USER_DELETE, + MESSAGE_WITH_AUTOGRAD_REQ, + MESSAGE_WITH_AUTOGRAD_RESP, SHUTDOWN, EXCEPTION, UNKNOWN @@ -38,8 +40,8 @@ enum MessageType { // request and response. Other implementation can ignore it // if they have their own ways to do matching. // -// Layers above ``RpcAgent`` only converts ScriptCall, ScriptRet, PythonCall, -// and PythonRet into a Message, and it is up to the RpcAgent +// Layers above ``RpcAgent`` only converts ScriptCall, ScriptResp, PythonCall, +// and PythonUDFResp into a Message, and it is up to the RpcAgent // implementation to determine how to serialize a message. class TORCH_API Message final { public: @@ -62,7 +64,11 @@ class TORCH_API Message final { Message& operator=(Message&& rhs) &; void swap(Message& rhs) noexcept; + // Destructively retrieves the payload. 
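Because the declaration below is rvalue-ref-qualified, the payload can only be taken from a Message the caller is finished with; a sketch (the char element type and the constructor shape are inferred from usage elsewhere in this patch):

Message m(std::vector<char>{'h', 'i'},
          std::vector<torch::Tensor>(),
          MessageType::SCRIPT_RET);
std::vector<char> payload = std::move(m).movePayload();  // m no longer owns the bytes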
+ std::vector&& movePayload() &&; + const std::vector& payload() const; + std::vector& tensors(); const std::vector& tensors() const; const MessageType& type() const; diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index a8c96f0a52f4f..0609fa2b08a31 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -1,5 +1,7 @@ #include +#include #include +#include #include @@ -22,7 +24,7 @@ void serialize(const Message& message, std::ostream& os) { std::vector tensors = message.tensors(); // append payload as a tensor tensors.push_back(torch::from_blob(payload, payload_size, {torch::kChar})); - // append id as a tensor + // append id and autograd metadata as a tensor tensors.push_back(torch::tensor({message.id()}, {torch::kInt64})); torch::save(tensors, os); @@ -39,6 +41,7 @@ Message deserialize(MessageType type, std::istream& is) { auto payloadTensor = std::move(tensors.back()); tensors.pop_back(); + TORCH_INTERNAL_ASSERT(1, idTensor.numel()); int64_t id = idTensor.storage().data()[0]; std::vector payload(payloadTensor.numel()); @@ -107,7 +110,7 @@ ProcessGroupAgent::ProcessGroupAgent( int numSendRecvThreads) : RpcAgent( WorkerId(std::move(workerName), pg->getRank()), - processRequestBlocking), + c10::guts::make_unique()), pg_(std::move(pg)), sendCounts_(pg_->getSize()), recvCounts_(pg_->getSize()), @@ -241,7 +244,7 @@ void ProcessGroupAgent::sync() { } while (hasPendingMessage()); } -std::shared_ptr ProcessGroupAgent::sendImpl( +std::shared_ptr ProcessGroupAgent::send( const WorkerId& to, Message&& message) { TORCH_CHECK( @@ -337,11 +340,11 @@ void ProcessGroupAgent::enqueueRecv(RecvWork work) { (char*)payload.storage().data(), payload.numel())); Message message = deserialize(work.type_, ss); - if (message.requiresResponse()) { - send(work.from_, cb_(std::move(message))); + auto response = cb_->operator()(message); + send(work.from_, std::move(response)); } else if (message.isRequest()) { - cb_(std::move(message)); + cb_->operator()(message); } else if (message.isResponse()) { auto id = message.id(); { @@ -351,7 +354,8 @@ void ProcessGroupAgent::enqueueRecv(RecvWork work) { } } else { // TODO: pass the error back to the caller instead of crashing here. - AT_ERROR("unrecognized message type ", message.type()); + TORCH_INTERNAL_ASSERT( + false, "unrecognized message type ", message.type()); } recvCounts_.increment(work.from_.id_); diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h index db78950b61115..bf192c8eb68da 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.h +++ b/torch/csrc/distributed/rpc/process_group_agent.h @@ -2,7 +2,6 @@ #include #include -#include #include #include #include @@ -53,7 +52,7 @@ class ProcessGroupAgent : public RpcAgent { // This method wraps the destination information and the message into a // SendWork object, and put the SendWork into a queue. Another thread will // consume SendWork from the queue and send it out. 
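Response matching, sketched; the map name and the exact bookkeeping are assumptions inferred from the id lookup in enqueueRecv() above:

std::unordered_map<int64_t, std::shared_ptr<FutureMessage>> pending;
auto fut = std::make_shared<FutureMessage>();
pending[/*message id=*/7] = fut;        // recorded when the request is sent
// ...later, on the receive thread, once a response with id 7 arrives:
pending[7]->markCompleted(Message());   // unblocks callers waiting in fut->wait()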
- std::shared_ptr sendImpl(const WorkerId& to, Message&& message) + std::shared_ptr send(const WorkerId& to, Message&& message) override; private: diff --git a/torch/csrc/distributed/rpc/python_functions.cpp b/torch/csrc/distributed/rpc/python_functions.cpp index 7314d2d1c831a..1b9e4e62ef757 100644 --- a/torch/csrc/distributed/rpc/python_functions.cpp +++ b/torch/csrc/distributed/rpc/python_functions.cpp @@ -1,4 +1,10 @@ #include +#include +#include +#include +#include +#include +#include namespace torch { namespace distributed { @@ -47,10 +53,12 @@ std::shared_ptr matchBuiltinOp( } // namespace -py::object toPyObj(const Message& message) { - switch (message.type()) { +using namespace torch::distributed::autograd; + +py::object toPyObjInternal(RpcCommandBase& rpc, MessageType messageType) { + switch (messageType) { case MessageType::SCRIPT_RET: { - ScriptRet ret = ScriptRet::fromMessage(message); + auto& ret = static_cast(rpc); Stack stack; stack.push_back(ret.value()); { @@ -61,18 +69,32 @@ py::object toPyObj(const Message& message) { } } case MessageType::PYTHON_RET: { - return PythonRpcHandler::getInstance().loadPythonUDFResult(message); + // TODO: Try to avoid a copy here. + auto& resp = static_cast(rpc); + return PythonRpcHandler::getInstance().loadPythonUDFResult( + resp.pickledPayload(), resp.tensors()); } - case MessageType::EXCEPTION: { - std::string err(message.payload().begin(), message.payload().end()); - throw std::runtime_error(err); + case MessageType::MESSAGE_WITH_AUTOGRAD_RESP: { + auto& rpcWithAutograd = static_cast(rpc); + + // Attach 'recv' autograd function. + addRecvRpcBackward( + rpcWithAutograd.autogradMetadata(), rpcWithAutograd.tensors()); + + // Handle the original RPC. + auto wrappedMessageType = rpcWithAutograd.wrappedMessageType(); + return toPyObjInternal(rpcWithAutograd.wrappedRpc(), wrappedMessageType); } default: { - AT_ERROR("Unrecognized response message type ", message.type()); + AT_ERROR("Unrecognized response message type ", messageType); } } } +py::object toPyObj(const Message& message) { + return toPyObjInternal(*deserializeResponse(message), message.type()); +} + std::shared_ptr pyRpcBuiltin( RpcAgent& agent, const WorkerId& dst, @@ -81,7 +103,28 @@ std::shared_ptr pyRpcBuiltin( const py::kwargs& kwargs) { Stack stack; auto op = matchBuiltinOp(opName, args, kwargs, stack); - return agent.send(dst, ScriptCall(op, std::move(stack)).toMessage()); + auto scriptCall = c10::guts::make_unique(op, std::move(stack)); + auto& autogradContainer = DistAutogradContainer::getInstance(); + if (autogradContainer.hasValidContext()) { + // Retrieve the appropriate context to modify. + auto& autogradContext = autogradContainer.currentContext(); + + // Wrap the original rpc with autograd information. + AutogradMetadata autogradMetadata( + autogradContext.context_id(), autogradContainer.newAutogradMessageId()); + RpcWithAutograd rpcWithAutograd( + MessageType::MESSAGE_WITH_AUTOGRAD_REQ, + autogradMetadata, + std::move(scriptCall)); + + // Record autograd information for 'send'. 
+ addSendRpcBackward( + autogradContext, autogradMetadata, rpcWithAutograd.tensors()); + + return agent.send(dst, std::move(rpcWithAutograd).toMessage()); + } else { + return agent.send(dst, std::move(*scriptCall).toMessage()); + } } std::shared_ptr pyRemoteBuiltin( @@ -111,11 +154,12 @@ std::shared_ptr pyRpcPythonUdf( const WorkerId& dst, const std::string& pickledPythonUDF, std::vector& tensors) { - std::vector data(pickledPythonUDF.begin(), pickledPythonUDF.end()); - return agent.send( dst, - Message(std::move(data), std::move(tensors), MessageType::PYTHON_CALL)); + PythonUDFCall( + std::vector(pickledPythonUDF.begin(), pickledPythonUDF.end()), + tensors) + .toMessage()); } } // namespace rpc diff --git a/torch/csrc/distributed/rpc/python_functions.h b/torch/csrc/distributed/rpc/python_functions.h index 37d1fb7743431..f3a1ffda07bde 100644 --- a/torch/csrc/distributed/rpc/python_functions.h +++ b/torch/csrc/distributed/rpc/python_functions.h @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/torch/csrc/distributed/rpc/python_rpc_handler.cpp b/torch/csrc/distributed/rpc/python_rpc_handler.cpp index 8f34b22f6d968..b9087e52119f6 100644 --- a/torch/csrc/distributed/rpc/python_rpc_handler.cpp +++ b/torch/csrc/distributed/rpc/python_rpc_handler.cpp @@ -18,23 +18,26 @@ PythonRpcHandler& PythonRpcHandler::getInstance() { } std::vector PythonRpcHandler::generatePythonUDFResult( - const Message& request, - std::vector& tensorTable) { + const std::vector& pickledPayload, + const std::vector& requestTensorTable, + std::vector& responseTensorTable) { AutoGIL ag; - auto pargs = py::bytes(request.payload().data(), request.payload().size()); + auto pargs = py::bytes(pickledPayload.data(), pickledPayload.size()); TORCH_CHECK(runUDFFunction_ != nullptr, "runUDFFunction_ is nullptr"); - py::tuple pres = runUDFFunction_(pargs, request.tensors()); + py::tuple pres = runUDFFunction_(pargs, requestTensorTable); const auto& presStr = pres[0].cast(); - tensorTable = pres[1].cast>(); + responseTensorTable = pres[1].cast>(); std::vector payload(presStr.begin(), presStr.end()); return payload; } -py::object PythonRpcHandler::loadPythonUDFResult(const Message& message) { +py::object PythonRpcHandler::loadPythonUDFResult( + const std::vector& pickledPayload, + const std::vector& tensorTable) { AutoGIL ag; - auto pargs = py::bytes(message.payload().data(), message.payload().size()); + auto pargs = py::bytes(pickledPayload.data(), pickledPayload.size()); TORCH_CHECK(loadResultFunction_ != nullptr, "loadResultFunction_ is nullptr"); - return loadResultFunction_(pargs, message.tensors()); + return loadResultFunction_(pargs, tensorTable); } } // namespace rpc diff --git a/torch/csrc/distributed/rpc/python_rpc_handler.h b/torch/csrc/distributed/rpc/python_rpc_handler.h index b2283adee23d3..2fd51b8942dc2 100644 --- a/torch/csrc/distributed/rpc/python_rpc_handler.h +++ b/torch/csrc/distributed/rpc/python_rpc_handler.h @@ -18,11 +18,14 @@ class PYBIND11_EXPORT PythonRpcHandler { static PythonRpcHandler& getInstance(); // Execute python UDF, result is pickled to binary string std::vector generatePythonUDFResult( - const Message& request, - std::vector& tensorTable); + const std::vector& pickledPayload, + const std::vector& requestTensorTable, + std::vector& responseTensorTable); // Returned python UDF result is pickled binary string, so run python // function to unpickle the python UDF result and return py::object to user - py::object loadPythonUDFResult(const Message& message); + 
py::object loadPythonUDFResult( + const std::vector& pickledPayload, + const std::vector& tensorTable); private: PythonRpcHandler(); diff --git a/torch/csrc/distributed/rpc/python_udf_call.cpp b/torch/csrc/distributed/rpc/python_udf_call.cpp new file mode 100644 index 0000000000000..1a99ba5051de9 --- /dev/null +++ b/torch/csrc/distributed/rpc/python_udf_call.cpp @@ -0,0 +1,37 @@ +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +PythonUDFCall::PythonUDFCall( + std::vector pickledPayload, + std::vector tensors) + : pickledPayload_(std::move(pickledPayload)), + tensors_(std::move(tensors)) {} + +Message PythonUDFCall::toMessage() && { + return Message( + std::move(pickledPayload_), + std::move(tensors_), + MessageType::PYTHON_CALL); +} + +std::unique_ptr PythonUDFCall::fromMessage( + const Message& message) { + return c10::guts::make_unique( + message.payload(), message.tensors()); +} + +const std::vector& PythonUDFCall::pickledPayload() const { + return pickledPayload_; +} + +const std::vector& PythonUDFCall::tensors() const { + return tensors_; +} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/python_udf_call.h b/torch/csrc/distributed/rpc/python_udf_call.h new file mode 100644 index 0000000000000..b46162dc32a18 --- /dev/null +++ b/torch/csrc/distributed/rpc/python_udf_call.h @@ -0,0 +1,31 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace rpc { + +// RPC call representing calling a Python UDF over RPC. +class TORCH_API PythonUDFCall final : public RpcCommandBase { + public: + explicit PythonUDFCall( + std::vector pickledPayload, + std::vector tensors); + + Message toMessage() && override; + + static std::unique_ptr fromMessage(const Message& message); + + const std::vector& pickledPayload() const; + + const std::vector& tensors() const; + + private: + std::vector pickledPayload_; + std::vector tensors_; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/python_udf_resp.cpp b/torch/csrc/distributed/rpc/python_udf_resp.cpp new file mode 100644 index 0000000000000..31ce56dfd7cba --- /dev/null +++ b/torch/csrc/distributed/rpc/python_udf_resp.cpp @@ -0,0 +1,35 @@ +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +PythonUDFResp::PythonUDFResp( + std::vector pickledPayload, + std::vector tensors) + : pickledPayload_(std::move(pickledPayload)), + tensors_(std::move(tensors)) {} + +Message PythonUDFResp::toMessage() && { + return Message( + std::move(pickledPayload_), std::move(tensors_), MessageType::PYTHON_RET); +} + +std::unique_ptr PythonUDFResp::fromMessage( + const Message& message) { + return c10::guts::make_unique( + message.payload(), message.tensors()); +} + +const std::vector& PythonUDFResp::pickledPayload() const { + return pickledPayload_; +} + +const std::vector& PythonUDFResp::tensors() const { + return tensors_; +} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/python_udf_resp.h b/torch/csrc/distributed/rpc/python_udf_resp.h new file mode 100644 index 0000000000000..dc644b47adba4 --- /dev/null +++ b/torch/csrc/distributed/rpc/python_udf_resp.h @@ -0,0 +1,31 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace rpc { + +// RPC call representing the response of a Python UDF over RPC. 
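A round-trip sketch against the interface declared below (the payload bytes are made up):

std::vector<char> payload = {'o', 'k'};
std::vector<torch::Tensor> tensors;
PythonUDFResp resp(std::move(payload), std::move(tensors));
Message m = std::move(resp).toMessage();  // carries MessageType::PYTHON_RET
std::unique_ptr<PythonUDFResp> back = PythonUDFResp::fromMessage(m);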
+class TORCH_API PythonUDFResp final : public RpcCommandBase { + public: + explicit PythonUDFResp( + std::vector pickledPayload, + std::vector tensors); + + Message toMessage() && override; + + static std::unique_ptr fromMessage(const Message& message); + + const std::vector& pickledPayload() const; + + const std::vector& tensors() const; + + private: + std::vector pickledPayload_; + std::vector tensors_; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback.cpp b/torch/csrc/distributed/rpc/request_callback.cpp new file mode 100644 index 0000000000000..cdb9a8993946b --- /dev/null +++ b/torch/csrc/distributed/rpc/request_callback.cpp @@ -0,0 +1,35 @@ +#include +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +using namespace torch::distributed::autograd; + +namespace { + +Message createException(const Message& request, const std::exception& e) { + const char* err = e.what(); + std::vector payload(err, err + strlen(err)); + return Message( + std::move(payload), + std::vector(), + MessageType::EXCEPTION, + request.id()); +} + +} // anonymous namespace + +Message RequestCallback::operator()(Message& request) const { + try { + return processMessage(request); + } catch (std::exception& e) { + return createException(request, e); + } +} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback.h b/torch/csrc/distributed/rpc/request_callback.h new file mode 100644 index 0000000000000..7771e13820bed --- /dev/null +++ b/torch/csrc/distributed/rpc/request_callback.h @@ -0,0 +1,32 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace rpc { + +// Functor which is invoked to process an RPC message. This is an abstract class +// with some common functionality across all request handlers. Users need to +// implement this interface to perform the actual business logic. +class TORCH_API RequestCallback { + public: + // Invoke the callback. + Message operator()(Message& request) const; + + virtual ~RequestCallback() {} + + protected: + // RpcAgent implementation should invoke ``RequestCallback`` to process + // received requests. There is no restriction on the implementation's + // threading model. This function takes an rvalue reference of the Message + // object. It is expected to return the response message or message + // containing an exception. Different rpc agent implementations are expected + // to ensure delivery of the response/exception based on their implementation + // specific mechanisms. 
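A concrete callback only overrides `processMessage`; any exception it throws is converted into an `EXCEPTION` message by `operator()` in request_callback.cpp above. A minimal sketch with a hypothetical `EchoRequestCallback`; the `char` payload element type is likewise an assumption:

#include <torch/csrc/distributed/rpc/request_callback.h>

namespace torch {
namespace distributed {
namespace rpc {

// Hypothetical callback that echoes each request back unchanged; real
// implementations dispatch on request.type(), as RequestCallbackImpl
// does later in this patch.
class EchoRequestCallback final : public RequestCallback {
 protected:
  Message processMessage(Message& request) const override {
    return Message(
        std::vector<char>(request.payload()), // copy the payload back
        request.tensors(),                    // copy the tensor table back
        request.type(),
        request.id());
  }
};

} // namespace rpc
} // namespace distributed
} // namespace torch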
+ virtual Message processMessage(Message& request) const = 0; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp new file mode 100644 index 0000000000000..d9eeabec64c87 --- /dev/null +++ b/torch/csrc/distributed/rpc/request_callback_impl.cpp @@ -0,0 +1,153 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +using namespace torch::distributed::autograd; + +std::unique_ptr RequestCallbackImpl::processRpc( + RpcCommandBase& rpc, + MessageType messageType) const { + // TODO: RpcCommandBase should have an abstract execute() method that we can + // call here instead of having another switch statement here. Even better we + // could have abstract classes RpcRequest and RpcResp which inherit from + // RpcCommandBase and RpcRequest declares the abstract method execute() that + // we can call here. RpcResponse could have an abstract method to convert it + // to a python object. + switch (messageType) { + case MessageType::SCRIPT_CALL: { + auto& scriptCall = static_cast(rpc); + + // sc is only alive within this block, use reference to avoid copy + auto& stack = scriptCall.stackRef(); + scriptCall.op()->getOperation()(stack); + + TORCH_INTERNAL_ASSERT( + stack.size() == 1, + "Return value of a builtin operator or a " + "TorchScript function should be a single IValue, got a vector of " + "size ", + stack.size()); + + return c10::guts::make_unique(std::move(stack.front())); + } + case MessageType::PYTHON_CALL: { + auto& pyCall = static_cast(rpc); + std::vector responseTensorTable; + auto payload = PythonRpcHandler::getInstance().generatePythonUDFResult( + pyCall.pickledPayload(), pyCall.tensors(), responseTensorTable); + return c10::guts::make_unique( + std::move(payload), std::move(responseTensorTable)); + } + case MessageType::REMOTE_CALL: { + auto& src = static_cast(rpc); + + auto rrefId = RRefId::fromIValue(src.retRRefId()); + auto forkId = ForkId::fromIValue(src.retForkId()); + TORCH_CHECK(rrefId != forkId, "Does not support remote call to self."); + + auto& ctx = RRefContext::getInstance(); + auto ownerRRef = ctx->getOrCreateOwnerRRef(rrefId); + + // TODO: make this asynchronous + // src is only alive within this block, use reference to avoid copy + auto& stack = src.stackRef(); + src.op()->getOperation()(stack); + TORCH_INTERNAL_ASSERT( + stack.size() == 1, + "Return value of a builtin operator or a " + "TorchScript function should be a single IValue, got a vector of " + "size ", + stack.size()); + + ownerRRef->setValue(std::move(stack.front())); + return nullptr; + } + case MessageType::RREF_FETCH_CALL: { + auto& srf = static_cast(rpc); + // TODO: make this asynchronous + std::shared_ptr> rref = + RRefContext::getInstance()->getOrCreateOwnerRRef( + RRefId::fromIValue(srf.value())); + return c10::guts::make_unique(rref->getValue()); + } + case MessageType::RREF_USER_CREATE: { + auto& sra = static_cast(rpc); + RRefContext::getInstance()->addFork(sra.valueRef()); + return nullptr; + } + case MessageType::RREF_USER_DELETE: { + auto& srd = static_cast(rpc); + RRefContext::getInstance()->delFork(srd.valueRef()); + return nullptr; + } + case MessageType::MESSAGE_WITH_AUTOGRAD_REQ: { + auto& rpcWithAutograd = static_cast(rpc); + const auto& autogradMetadata = 
rpcWithAutograd.autogradMetadata(); + + // Attach 'recv' autograd function. + DistAutogradContext* autogradContext = addRecvRpcBackward( + rpcWithAutograd.autogradMetadata(), rpcWithAutograd.tensors()); + + // Process the original RPC. + auto wrappedMessageType = rpcWithAutograd.wrappedMessageType(); + auto wrappedRpcResponse = + processRpc(rpcWithAutograd.wrappedRpc(), wrappedMessageType); + + // Wrap the response with autograd, need a new autograd message id for + // each send/recv pair. + auto& autogradContainer = DistAutogradContainer::getInstance(); + AutogradMetadata responseAutogradMetadata( + autogradMetadata.autogradContextId, + autogradContainer.newAutogradMessageId()); + + auto response = c10::guts::make_unique( + MessageType::MESSAGE_WITH_AUTOGRAD_RESP, + responseAutogradMetadata, + std::move(wrappedRpcResponse)); + + // Attach the 'send' autograd function if needed. + if (autogradContext != nullptr) { + addSendRpcBackward( + *autogradContext, responseAutogradMetadata, response->tensors()); + } + return std::move(response); + } + default: { + TORCH_INTERNAL_ASSERT( + false, "Request type ", messageType, " not supported."); + } + } +} + +Message RequestCallbackImpl::processMessage(Message& request) const { + std::unique_ptr rpc = deserializeRequest(request); + auto response = processRpc(*rpc, request.type()); + if (response == nullptr) { + return Message(); + } + auto responseMessage = std::move(*response).toMessage(); + responseMessage.setId(request.id()); + return responseMessage; +} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/request_callback_impl.h b/torch/csrc/distributed/rpc/request_callback_impl.h new file mode 100644 index 0000000000000..37231761660cd --- /dev/null +++ b/torch/csrc/distributed/rpc/request_callback_impl.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +class TORCH_API RequestCallbackImpl : public RequestCallback { + public: + Message processMessage(Message& request) const override; + + private: + std::unique_ptr processRpc( + RpcCommandBase& rpc, + MessageType messageType) const; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp index 9725513107a5f..4fef3fd9a315f 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.cpp +++ b/torch/csrc/distributed/rpc/rpc_agent.cpp @@ -1,15 +1,12 @@ #include -#include -#include namespace torch { namespace distributed { namespace rpc { constexpr size_t WorkerId::MAX_NAME_LEN; -using namespace torch::distributed::autograd; -RpcAgent::RpcAgent(WorkerId workerId, RequestCallback cb) +RpcAgent::RpcAgent(WorkerId workerId, std::unique_ptr cb) : workerId_(std::move(workerId)), cb_(std::move(cb)) {} RpcAgent::~RpcAgent() = default; @@ -18,24 +15,6 @@ const WorkerId& RpcAgent::getWorkerId() const { return workerId_; } -std::shared_ptr RpcAgent::send( - const WorkerId& to, - Message&& message) { - // Record appropriate autograd information before sending the message over the - // wire. - auto& autogradContainer = DistAutogradContainer::getInstance(); - if (autogradContainer.hasValidContext()) { - // Attach the appropriate autograd edges to the tensors found in the - // message. - auto grad_fn = addSendRpcBackward(message.tensors()); - - // Record the send function in our current context. 
- auto& currentContext = autogradContainer.currentContext(); - currentContext.addSendFunction(grad_fn); - } - - return sendImpl(to, std::forward(message)); -} } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index f5ebda81efa3f..f539a526f1534 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -48,17 +49,6 @@ struct WorkerId { // will invoke the given ``RequestCallback`` to process received requests. It // should immediately become ready to serve request and accept response after // construction. -class RpcAgent; - -// RpcAgent implementation should invoke ``RequestCallback`` to process received -// requests. There is no restriction on the implementation's threading model. -// This function takes an rvalue reference of the Message object. -// It is expected to return the response message or message containing an -// exception. Different rpc agent implementations are expected to ensure -// delivery of the response/exception based on their implementation specific -// mechanisms. -using RequestCallback = std::function; - class RpcAgent { public: // `WorkerId` is the globally unique identifier for this RpcAgent instance. It @@ -71,7 +61,7 @@ class RpcAgent { // ``RpcAgent`` base class makes no assumption on the thread-safeness of the // ``RequestCallback``. ``RpcAgent`` implementations need to make sure that // its threading model conform to ``RequestCallback``'s requirement. - RpcAgent(WorkerId id, RequestCallback cb); + RpcAgent(WorkerId id, std::unique_ptr cb); virtual ~RpcAgent(); @@ -82,7 +72,9 @@ class RpcAgent { // If ``message.isRequest()`` is true, the ``FutureMessage`` will be completed // when the response arrives. For other message types, the Future should be // ignored by the caller. - std::shared_ptr send(const WorkerId& to, Message&& message); + virtual std::shared_ptr send( + const WorkerId& to, + Message&& message) = 0; // Return a reference to the ``WorkerId`` of this RpcAgent. // NB: not using ``c10::optional`` here because we might @@ -105,15 +97,8 @@ class RpcAgent { protected: const WorkerId workerId_; - - // Method that needs to be overridden by all implementations of this - // interface. The public send() method is responsible for common - // pre-processing shared across all implementations. - virtual std::shared_ptr sendImpl( - const WorkerId& to, - Message&& message) = 0; const std::string workerName_; - const RequestCallback cb_; + const std::unique_ptr cb_; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/rpc_command_base.h b/torch/csrc/distributed/rpc/rpc_command_base.h new file mode 100644 index 0000000000000..b992624d77f76 --- /dev/null +++ b/torch/csrc/distributed/rpc/rpc_command_base.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace rpc { + +// Base class for all RPC request and responses. +class RpcCommandBase { + public: + // Need to override this to serialize the RPC. This should destructively + // create a message for the RPC (Hence the &&). 
+ virtual Message toMessage() && = 0; + virtual ~RpcCommandBase() = 0; +}; + +inline RpcCommandBase::~RpcCommandBase() {} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/rpc_with_autograd.cpp b/torch/csrc/distributed/rpc/rpc_with_autograd.cpp new file mode 100644 index 0000000000000..70296d8e868de --- /dev/null +++ b/torch/csrc/distributed/rpc/rpc_with_autograd.cpp @@ -0,0 +1,163 @@ +#include +#include +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +constexpr int kAutogradMessageSize = 17; + +AutogradMetadata::AutogradMetadata( + int64_t autogradContextId_, + int64_t autogradMessageId_) + : autogradContextId(autogradContextId_), + autogradMessageId(autogradMessageId_) {} + +RpcWithAutograd::RpcWithAutograd( + MessageType messageType, + const AutogradMetadata& autogradMetadata, + std::unique_ptr wrappedRpc) + : messageType_(messageType), autogradMetadata_(autogradMetadata) { + TORCH_INTERNAL_ASSERT(wrappedRpc != nullptr, "wrappedRpc cannot be null!"); + TORCH_INTERNAL_ASSERT( + messageType_ == MessageType::MESSAGE_WITH_AUTOGRAD_REQ || + messageType_ == MessageType::MESSAGE_WITH_AUTOGRAD_RESP); + wrappedMessage_ = std::move(*wrappedRpc).toMessage(); + tensors_ = wrappedMessage_.tensors(); + wrappedMessageType_ = wrappedMessage_.type(); +} + +RpcWithAutograd::RpcWithAutograd( + MessageType messageType, + const AutogradMetadata& autogradMetadata, + std::unique_ptr wrappedRpc, + MessageType wrappedMessageType, + std::vector tensors) + : messageType_(messageType), + autogradMetadata_(autogradMetadata), + wrappedRpc_(std::move(wrappedRpc)), + wrappedMessageType_(wrappedMessageType), + tensors_(std::move(tensors)) { + TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cannot be null!"); + TORCH_INTERNAL_ASSERT( + messageType_ == MessageType::MESSAGE_WITH_AUTOGRAD_REQ || + messageType_ == MessageType::MESSAGE_WITH_AUTOGRAD_RESP); +} + +Message RpcWithAutograd::toMessage() && { + auto messageId = wrappedMessage_.id(); + auto messageType = wrappedMessage_.type(); + + auto payload = std::move(wrappedMessage_).movePayload(); + TORCH_INTERNAL_ASSERT(!payload.empty()); + + // We append the message type (1 byte), autograd context id(8 bytes) and + // autograd message id(8 bytes) to the original message in network byte order + // (big endian). + size_t writableIndex = payload.size(); + + // Need 17 additional bytes. + payload.resize(payload.size() + kAutogradMessageSize); + + // Add message type. + payload[writableIndex++] = messageType; + + // Add autograd ids. + torch::utils::THP_encodeInt64Buffer( + reinterpret_cast(payload.data()) + writableIndex, + &autogradMetadata_.autogradContextId, + torch::utils::THPByteOrder::THP_BIG_ENDIAN, + 1); + writableIndex += sizeof(int64_t); + torch::utils::THP_encodeInt64Buffer( + reinterpret_cast(payload.data()) + writableIndex, + &autogradMetadata_.autogradMessageId, + torch::utils::THPByteOrder::THP_BIG_ENDIAN, + 1); + + return Message( + std::move(payload), std::move(tensors_), messageType_, messageId); +} + +std::unique_ptr RpcWithAutograd::fromMessage( + const Message& message) { + MessageType originalMessageType = message.type(); + TORCH_INTERNAL_ASSERT( + MessageType::MESSAGE_WITH_AUTOGRAD_REQ == originalMessageType || + MessageType::MESSAGE_WITH_AUTOGRAD_RESP == originalMessageType); + + std::vector tensors = message.tensors(); + int64_t messageId = message.id(); + // Decode message type, autograd context id and autograd message id. 
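Concretely, `toMessage()` above leaves the payload with this trailing layout (both ids big-endian), which the code below peels off from the back:

  [ wrapped payload (N bytes) | wrapped type (1B) | autogradContextId (8B) | autogradMessageId (8B) ]
                              |<---------- kAutogradMessageSize = 17 bytes ---------->|

A hedged helper equivalent to the decode steps that follow; `readBE64` is illustrative and not part of the patch, and the `char` payload type is assumed:

#include <torch/csrc/utils/byte_order.h>
#include <cstdint>
#include <vector>

int64_t readBE64(const std::vector<char>& payload, size_t at) {
  int64_t v;
  torch::utils::THP_decodeInt64Buffer(
      &v,
      reinterpret_cast<const uint8_t*>(payload.data()) + at,
      torch::utils::THPByteOrder::THP_BIG_ENDIAN,
      1);
  return v;
}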
+ auto payload = message.payload(); + TORCH_INTERNAL_ASSERT(payload.size() > kAutogradMessageSize); + + int64_t autogradContextId, autogradMessageId; + // autograd message id. + size_t indexToRead = payload.size() - sizeof(int64_t); + TORCH_INTERNAL_ASSERT(indexToRead >= 0); + torch::utils::THP_decodeInt64Buffer( + &autogradMessageId, + reinterpret_cast(payload.data()) + indexToRead, + torch::utils::THPByteOrder::THP_BIG_ENDIAN, + 1); + + // autograd context id. + indexToRead -= sizeof(int64_t); + TORCH_INTERNAL_ASSERT(indexToRead >= 0); + torch::utils::THP_decodeInt64Buffer( + &autogradContextId, + reinterpret_cast(payload.data()) + indexToRead, + torch::utils::THPByteOrder::THP_BIG_ENDIAN, + 1); + + // message type. + indexToRead -= 1; + TORCH_INTERNAL_ASSERT(indexToRead >= 0); + MessageType wrappedMessageType = + static_cast(payload[indexToRead]); + + // Remove the autograd information. + payload.resize(payload.size() - kAutogradMessageSize); + + // Create new message type and build wrapped RPC. + Message wrappedMessage( + std::move(payload), std::move(tensors), wrappedMessageType, messageId); + + std::unique_ptr wrappedRpc; + if (originalMessageType == MessageType::MESSAGE_WITH_AUTOGRAD_REQ) { + wrappedRpc = deserializeRequest(wrappedMessage); + } else { + wrappedRpc = deserializeResponse(wrappedMessage); + } + + return c10::guts::make_unique( + originalMessageType, + AutogradMetadata(autogradContextId, autogradMessageId), + std::move(wrappedRpc), + wrappedMessageType, + std::move(tensors)); +} + +std::vector& RpcWithAutograd::tensors() { + return tensors_; +} + +const AutogradMetadata& RpcWithAutograd::autogradMetadata() const { + return autogradMetadata_; +} + +RpcCommandBase& RpcWithAutograd::wrappedRpc() { + TORCH_INTERNAL_ASSERT(wrappedRpc_ != nullptr, "wrappedRpc cannot be null!"); + return *wrappedRpc_; +} + +MessageType RpcWithAutograd::wrappedMessageType() const { + return wrappedMessageType_; +} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/rpc_with_autograd.h b/torch/csrc/distributed/rpc/rpc_with_autograd.h new file mode 100644 index 0000000000000..41d3a409a0923 --- /dev/null +++ b/torch/csrc/distributed/rpc/rpc_with_autograd.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace rpc { + +struct TORCH_API AutogradMetadata { + AutogradMetadata(int64_t autogradContextId, int64_t autogradMessageId); + + // autogradContextId_ is a globally unique integer that identifies a + // particular distributed autograd pass. + int64_t autogradContextId; + // autogradMessageId_ is a globally unique integer that identifies a pair + // of send/recv autograd functions. + int64_t autogradMessageId; +}; + +// Represents an RPC that includes autograd information. This class basically +// wraps another `RpcCommandBase` object which represents the actual RPC and has +// additional autograd information associated with that RPC. +class TORCH_API RpcWithAutograd final : public RpcCommandBase { + public: + // Used when we are sending an RPC over the wire. + RpcWithAutograd( + MessageType messageType, + const AutogradMetadata& autogradMetadata, + std::unique_ptr wrappedRpc); + + // Used when receiving an RPC over the wire. 
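The constructor above is the sending path: it serializes the wrapped RPC eagerly and caches the result in `wrappedMessage_`. The constructor below is the receiving path used by `fromMessage`, where the pieces are already deserialized. A sending-side sketch; the ids are placeholders for values normally taken from `DistAutogradContainer`:

#include <torch/csrc/distributed/rpc/rpc_with_autograd.h>

using namespace torch::distributed::rpc;

Message wrapWithAutograd(std::unique_ptr<RpcCommandBase> wrappedRpc) {
  AutogradMetadata meta(
      /*autogradContextId=*/1, /*autogradMessageId=*/42); // placeholder ids
  RpcWithAutograd rpc(
      MessageType::MESSAGE_WITH_AUTOGRAD_REQ, meta, std::move(wrappedRpc));
  return std::move(rpc).toMessage(); // appends the 17-byte autograd suffix
}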
+ RpcWithAutograd( + MessageType messageType, + const AutogradMetadata& autogradMetadata, + std::unique_ptr wrappedRpc, + MessageType wrappedMessageType, + std::vector tensors); + + Message toMessage() && override; + + static std::unique_ptr fromMessage(const Message& message); + + // Retrieves tensors as part of this RPC, which need to be considered for + // autograd computations. + std::vector& tensors(); + + const AutogradMetadata& autogradMetadata() const; + + RpcCommandBase& wrappedRpc(); + + // Message type of the wrapped RPC. + MessageType wrappedMessageType() const; + + private: + // Message type for this call. + MessageType messageType_; + + AutogradMetadata autogradMetadata_; + std::unique_ptr wrappedRpc_; + + // Serialized message representing wrappedRpc_. Used mostly as a cache to + // avoid serializing the request twice. + Message wrappedMessage_; + + // message type of the wrappedMessage, this is stored separately since + // wrappedMessage_ is not always guaranteed to be populated. + MessageType wrappedMessageType_; + + // Tensors part of the wrappedRpc that need to be considered for autograd. + std::vector tensors_; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/script_call.cpp b/torch/csrc/distributed/rpc/script_call.cpp index e8ee647e0a4bb..6d9d3eac2312b 100644 --- a/torch/csrc/distributed/rpc/script_call.cpp +++ b/torch/csrc/distributed/rpc/script_call.cpp @@ -64,7 +64,7 @@ std::shared_ptr ScriptCall::fromIValues( } } -Message ScriptCall::toMessage() { +Message ScriptCall::toMessage() && { std::vector ivalues; toIValues(ivalues); @@ -76,7 +76,7 @@ Message ScriptCall::toMessage() { std::move(payload), std::move(tensor_table), MessageType::SCRIPT_CALL); } -ScriptCall ScriptCall::fromMessage(const Message& message) { +std::unique_ptr ScriptCall::fromMessage(const Message& message) { auto payload = static_cast(message.payload().data()); auto payload_size = message.payload().size(); auto value = @@ -84,7 +84,7 @@ ScriptCall ScriptCall::fromMessage(const Message& message) { auto values = value.toTuple()->elements(); auto op = fromIValues(values); - return ScriptCall(op, std::move(values)); + return c10::guts::make_unique(op, std::move(values)); } std::shared_ptr ScriptCall::matchOperator( diff --git a/torch/csrc/distributed/rpc/script_call.h b/torch/csrc/distributed/rpc/script_call.h index 4a38eed754f8e..c2954765102c1 100644 --- a/torch/csrc/distributed/rpc/script_call.h +++ b/torch/csrc/distributed/rpc/script_call.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -15,7 +16,7 @@ using torch::jit::Operator; // A ScriptCall instance represents an invocation of a builtin operator for a // TorchScript function (not implemented yet). If it is a builtin operator, it // contains a shared ptr to the `Operator` and a list of arguments. 
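On the server, `RequestCallbackImpl::processRpc` (earlier in this patch) executes a `ScriptCall` by running the operator in place on the deserialized IValue stack. That path, condensed into a sketch:

#include <torch/csrc/distributed/rpc/script_call.h>
#include <torch/csrc/distributed/rpc/script_resp.h>

using namespace torch::distributed::rpc;

Message runScriptCall(const Message& request) {
  auto call = ScriptCall::fromMessage(request); // now returns a unique_ptr
  auto& stack = call->stackRef();               // arguments as IValues
  call->op()->getOperation()(stack);            // run the builtin op in place
  TORCH_INTERNAL_ASSERT(stack.size() == 1);     // exactly one IValue result
  return ScriptResp(std::move(stack.front())).toMessage();
}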
-class TORCH_API ScriptCall { +class TORCH_API ScriptCall : public RpcCommandBase { public: ScriptCall(std::shared_ptr op, std::vector&& args); @@ -24,8 +25,8 @@ class TORCH_API ScriptCall { const std::vector& stack() const; std::vector& stackRef(); - Message toMessage(); - static ScriptCall fromMessage(const Message& message); + Message toMessage() && override; + static std::unique_ptr fromMessage(const Message& message); virtual ~ScriptCall() = default; diff --git a/torch/csrc/distributed/rpc/script_remote_call.cpp b/torch/csrc/distributed/rpc/script_remote_call.cpp index 40a8638f19eca..a8d8a7a19dc42 100644 --- a/torch/csrc/distributed/rpc/script_remote_call.cpp +++ b/torch/csrc/distributed/rpc/script_remote_call.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace torch { @@ -22,7 +23,7 @@ const at::IValue& ScriptRemoteCall::retForkId() { return retForkId_; } -Message ScriptRemoteCall::toMessage() const { +Message ScriptRemoteCall::toMessage() && { std::vector ivalues; ScriptCall::toIValues(ivalues); ivalues.push_back(retRRefId_); @@ -36,7 +37,8 @@ Message ScriptRemoteCall::toMessage() const { std::move(payload), std::move(tensor_table), MessageType::REMOTE_CALL); } -ScriptRemoteCall ScriptRemoteCall::fromMessage(const Message& message) { +std::unique_ptr ScriptRemoteCall::fromMessage( + const Message& message) { auto payload = static_cast(message.payload().data()); auto payload_size = message.payload().size(); @@ -51,7 +53,7 @@ ScriptRemoteCall ScriptRemoteCall::fromMessage(const Message& message) { values.pop_back(); auto op = ScriptCall::fromIValues(values); - return ScriptRemoteCall( + return c10::guts::make_unique( op, std::move(values), std::move(retRRefId), std::move(retForkId)); } diff --git a/torch/csrc/distributed/rpc/script_remote_call.h b/torch/csrc/distributed/rpc/script_remote_call.h index 0602884f6e40d..e0afb9497ee44 100644 --- a/torch/csrc/distributed/rpc/script_remote_call.h +++ b/torch/csrc/distributed/rpc/script_remote_call.h @@ -25,8 +25,8 @@ class TORCH_API ScriptRemoteCall final : public ScriptCall { const at::IValue& retRRefId(); const at::IValue& retForkId(); - Message toMessage() const; - static ScriptRemoteCall fromMessage(const Message& message); + Message toMessage() && override; + static std::unique_ptr fromMessage(const Message& message); private: const at::IValue retRRefId_; diff --git a/torch/csrc/distributed/rpc/script_ret.cpp b/torch/csrc/distributed/rpc/script_resp.cpp similarity index 66% rename from torch/csrc/distributed/rpc/script_ret.cpp rename to torch/csrc/distributed/rpc/script_resp.cpp index 07c71518c822c..7165bccd9cf41 100644 --- a/torch/csrc/distributed/rpc/script_ret.cpp +++ b/torch/csrc/distributed/rpc/script_resp.cpp @@ -1,4 +1,5 @@ -#include +#include +#include #include #include @@ -13,13 +14,13 @@ using torch::jit::Unpickler; } // namespace -ScriptRet::ScriptRet(at::IValue&& value) : value_(value) {} +ScriptResp::ScriptResp(at::IValue&& value) : value_(value) {} -const at::IValue& ScriptRet::value() { +const at::IValue& ScriptResp::value() { return value_; } -Message ScriptRet::toMessage() { +Message ScriptResp::toMessage() && { std::vector tensor_table; auto payload = jit::pickle(value_, &tensor_table); ; @@ -27,12 +28,12 @@ Message ScriptRet::toMessage() { std::move(payload), std::move(tensor_table), MessageType::SCRIPT_RET); } -ScriptRet ScriptRet::fromMessage(const Message& message) { +std::unique_ptr ScriptResp::fromMessage(const Message& message) { auto payload = static_cast(message.payload().data()); auto payload_size = 
message.payload().size(); auto value = jit::unpickle(payload, payload_size, nullptr, &message.tensors()); - return ScriptRet(std::move(value)); + return c10::guts::make_unique(std::move(value)); } } // namespace rpc diff --git a/torch/csrc/distributed/rpc/script_ret.h b/torch/csrc/distributed/rpc/script_resp.h similarity index 57% rename from torch/csrc/distributed/rpc/script_ret.h rename to torch/csrc/distributed/rpc/script_resp.h index 6632b9e589828..5e184cb6232c0 100644 --- a/torch/csrc/distributed/rpc/script_ret.h +++ b/torch/csrc/distributed/rpc/script_resp.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace torch { @@ -8,13 +9,13 @@ namespace distributed { namespace rpc { // Return value of a builtin operator or a TorchScript function. -class TORCH_API ScriptRet final { +class TORCH_API ScriptResp final : public RpcCommandBase { public: - explicit ScriptRet(at::IValue&& values); + explicit ScriptResp(at::IValue&& values); const at::IValue& value(); - Message toMessage(); - static ScriptRet fromMessage(const Message& message); + Message toMessage() && override; + static std::unique_ptr fromMessage(const Message& message); private: const at::IValue value_; diff --git a/torch/csrc/distributed/rpc/script_rref_proto.cpp b/torch/csrc/distributed/rpc/script_rref_proto.cpp index 4b8be7d7d5518..4e0f55d8d9f1f 100644 --- a/torch/csrc/distributed/rpc/script_rref_proto.cpp +++ b/torch/csrc/distributed/rpc/script_rref_proto.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace torch { @@ -13,7 +14,7 @@ at::IValue& RRefMessageBase::valueRef() { return value_; } -Message RRefMessageBase::toMessage() const { +Message RRefMessageBase::toMessage() && { std::vector ivalues; ivalues.push_back(value_); std::vector tensor_table; @@ -35,20 +36,26 @@ at::IValue RRefMessageBase::fromMessage(const Message& message) { return std::move(values.front()); } -ScriptRRefFetchCall ScriptRRefFetchCall::fromMessage(const Message& message) { - return ScriptRRefFetchCall(RRefMessageBase::fromMessage(message)); +std::unique_ptr ScriptRRefFetchCall::fromMessage( + const Message& message) { + return c10::guts::make_unique( + RRefMessageBase::fromMessage(message)); } ScriptRRefFetchRet ScriptRRefFetchRet::fromMessage(const Message& message) { return ScriptRRefFetchRet(RRefMessageBase::fromMessage(message)); } -ScriptRRefCreate ScriptRRefCreate::fromMessage(const Message& message) { - return ScriptRRefCreate(RRefMessageBase::fromMessage(message)); +std::unique_ptr ScriptRRefCreate::fromMessage( + const Message& message) { + return c10::guts::make_unique( + RRefMessageBase::fromMessage(message)); } -ScriptRRefDelete ScriptRRefDelete::fromMessage(const Message& message) { - return ScriptRRefDelete(RRefMessageBase::fromMessage(message)); +std::unique_ptr ScriptRRefDelete::fromMessage( + const Message& message) { + return c10::guts::make_unique( + RRefMessageBase::fromMessage(message)); } } // namespace rpc diff --git a/torch/csrc/distributed/rpc/script_rref_proto.h b/torch/csrc/distributed/rpc/script_rref_proto.h index de35b7e72a251..45d6c5fc42e4a 100644 --- a/torch/csrc/distributed/rpc/script_rref_proto.h +++ b/torch/csrc/distributed/rpc/script_rref_proto.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -11,7 +12,7 @@ namespace rpc { // Temporary solution of RRef operations. // TODO: Remove all these messages and use rpc + registered functions instead. 
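All four RRef messages share this shape, a single IValue tagged with a MessageType, and with this change every `fromMessage` returns an owning pointer. A round-trip sketch; `forkData` stands in for whatever IValue the RRef layer supplies:

#include <torch/csrc/distributed/rpc/script_rref_proto.h>

using namespace torch::distributed::rpc;

void rrefFetchRoundTrip(at::IValue forkData) {
  ScriptRRefFetchCall fetch(std::move(forkData)); // tagged RREF_FETCH_CALL
  Message m = std::move(fetch).toMessage();
  // Receiver side: deserialization now yields a unique_ptr.
  std::unique_ptr<ScriptRRefFetchCall> call =
      ScriptRRefFetchCall::fromMessage(m);
  const at::IValue& value = call->value(); // the IValue survives the trip
  (void)value;
}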
-class TORCH_API RRefMessageBase { +class TORCH_API RRefMessageBase : public RpcCommandBase { public: RRefMessageBase(at::IValue value, MessageType type) : value_(std::move(value)), type_(type) {} @@ -19,7 +20,7 @@ class TORCH_API RRefMessageBase { const at::IValue& value(); at::IValue& valueRef(); - Message toMessage() const; + Message toMessage() && override; static at::IValue fromMessage(const Message& message); private: @@ -34,7 +35,8 @@ class TORCH_API ScriptRRefFetchCall final : public RRefMessageBase { : RRefMessageBase(std::move(rrefForkData), MessageType::RREF_FETCH_CALL) { } - static ScriptRRefFetchCall fromMessage(const Message& message); + static std::unique_ptr fromMessage( + const Message& message); }; // OwnerRRef uses this message to send the RRef value to a remote UserRRef @@ -52,7 +54,7 @@ class TORCH_API ScriptRRefCreate final : public RRefMessageBase { ScriptRRefCreate(at::IValue value) : RRefMessageBase(std::move(value), MessageType::RREF_USER_CREATE) {} - static ScriptRRefCreate fromMessage(const Message& message); + static std::unique_ptr fromMessage(const Message& message); }; // UserRRef (regardless of it's the creator or not) uses this message to notify @@ -62,7 +64,7 @@ class TORCH_API ScriptRRefDelete final : public RRefMessageBase { ScriptRRefDelete(at::IValue value) : RRefMessageBase(std::move(value), MessageType::RREF_USER_DELETE) {} - static ScriptRRefDelete fromMessage(const Message& message); + static std::unique_ptr fromMessage(const Message& message); }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp new file mode 100644 index 0000000000000..0bf80593b2fd9 --- /dev/null +++ b/torch/csrc/distributed/rpc/utils.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +std::unique_ptr deserializeRequest(const Message& request) { + switch (request.type()) { + case MessageType::SCRIPT_CALL: { + return ScriptCall::fromMessage(request); + } + case MessageType::PYTHON_CALL: { + return PythonUDFCall::fromMessage(request); + } + case MessageType::REMOTE_CALL: { + return ScriptRemoteCall::fromMessage(request); + } + case MessageType::RREF_FETCH_CALL: { + return ScriptRRefFetchCall::fromMessage(request); + } + case MessageType::RREF_USER_CREATE: { + return ScriptRRefCreate::fromMessage(request); + } + case MessageType::RREF_USER_DELETE: { + return ScriptRRefDelete::fromMessage(request); + } + case MessageType::MESSAGE_WITH_AUTOGRAD_REQ: { + return RpcWithAutograd::fromMessage(request); + } + default: { + TORCH_INTERNAL_ASSERT( + false, "Request type ", request.type(), " not supported."); + } + } +} + +std::unique_ptr deserializeResponse(const Message& response) { + switch (response.type()) { + case MessageType::SCRIPT_RET: { + return ScriptResp::fromMessage(response); + } + case MessageType::PYTHON_RET: { + return PythonUDFResp::fromMessage(response); + } + case MessageType::EXCEPTION: { + std::string err(response.payload().begin(), response.payload().end()); + throw std::runtime_error(err); + } + case MessageType::MESSAGE_WITH_AUTOGRAD_RESP: { + return RpcWithAutograd::fromMessage(response); + } + default: { + TORCH_INTERNAL_ASSERT( + false, "Response type ", response.type(), " not supported."); + } + } +} + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/utils.h b/torch/csrc/distributed/rpc/utils.h new file mode 100644 index 
0000000000000..d028bb3a7fa7c --- /dev/null +++ b/torch/csrc/distributed/rpc/utils.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace torch { +namespace distributed { +namespace rpc { + +// Given an RPC message received as a request over the wire, deserialize it into +// the appropriate 'RpcCommandBase' type. +TORCH_API std::unique_ptr deserializeRequest( + const Message& request); + +// Given an RPC message received as a response over the wire, deserialize it +// into the appropriate 'RpcCommandBase' type. +TORCH_API std::unique_ptr deserializeResponse( + const Message& response); + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/generic/StorageMethods.cpp b/torch/csrc/generic/StorageMethods.cpp index 0789d59bf75ba..7a6c614816471 100644 --- a/torch/csrc/generic/StorageMethods.cpp +++ b/torch/csrc/generic/StorageMethods.cpp @@ -105,13 +105,13 @@ static PyObject * THPStorage_(fromBuffer)(PyObject *_unused, PyObject *args, PyO } #if !(defined(TH_REAL_IS_BYTE) || defined(TH_REAL_IS_CHAR)) - THPByteOrder byte_order; + torch::utils::THPByteOrder byte_order; if (strcmp(byte_order_str, "native") == 0) { - byte_order = THP_nativeByteOrder(); + byte_order = torch::utils::THP_nativeByteOrder(); } else if (strcmp(byte_order_str, "big") == 0) { - byte_order = THP_BIG_ENDIAN; + byte_order = torch::utils::THP_BIG_ENDIAN; } else if (strcmp(byte_order_str, "little") == 0) { - byte_order = THP_LITTLE_ENDIAN; + byte_order = torch::utils::THP_LITTLE_ENDIAN; } else { PyErr_Format(PyExc_ValueError, "invalid byte_order '%s' (expected 'big', 'little', or 'native')", @@ -158,22 +158,30 @@ static PyObject * THPStorage_(fromBuffer)(PyObject *_unused, PyObject *args, PyO // Because of ASAN checks, that are failing in the THStorage.cpp whenever // we are trying to get a value which is not 0 or 1, we have to manually // convert original values to boolean ones. 
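The `THP_decode*Buffer`/`THP_encode*Buffer` helpers keep their signatures but now live in `torch::utils` (see torch/csrc/utils/byte_order.h later in this patch). A minimal usage sketch, decoding two little-endian int32 values from raw bytes:

#include <torch/csrc/utils/byte_order.h>
#include <cstdint>

void decodeDemo() {
  const uint8_t raw[8] = {0x01, 0x00, 0x00, 0x00,  // 1, little-endian
                          0x02, 0x00, 0x00, 0x00}; // 2
  int32_t out[2];
  torch::utils::THP_decodeInt32Buffer(
      out, raw, torch::utils::THPByteOrder::THP_LITTLE_ENDIAN, 2);
  // out[0] == 1 and out[1] == 2; THP_nativeByteOrder() reports the host's
  // own byte order, which the callers above use to pick the fast path.
}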
- THP_decodeBoolBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeBoolBuffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_SHORT) - THP_decodeInt16Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeInt16Buffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_INT) - THP_decodeInt32Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeInt32Buffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_LONG) // TODO: remove the cast - THP_decodeInt64Buffer((int64_t*) THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeInt64Buffer( + (int64_t*)THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_HALF) - THP_decodeHalfBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeHalfBuffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_BFLOAT16) - THP_decodeBFloat16Buffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeBFloat16Buffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_FLOAT) - THP_decodeFloatBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeFloatBuffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #elif defined(TH_REAL_IS_DOUBLE) - THP_decodeDoubleBuffer(THWStorage_(data)(storage), src + offset, byte_order, count); + torch::utils::THP_decodeDoubleBuffer( + THWStorage_(data)(storage), src + offset, byte_order, count); #else #error "Unknown type" #endif diff --git a/torch/csrc/generic/serialization.cpp b/torch/csrc/generic/serialization.cpp index 7e2c8685082e6..b8e701fd96a78 100644 --- a/torch/csrc/generic/serialization.cpp +++ b/torch/csrc/generic/serialization.cpp @@ -22,15 +22,22 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) data = (scalar_t*)cpu_data.get(); THCudaCheck(cudaMemcpy(data, THWStorage_(data)(LIBRARY_STATE self), size * sizeof(scalar_t), cudaMemcpyDeviceToHost)); #endif - if (THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) + if (torch::utils::THP_nativeByteOrder() == + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) doWrite(fd, &size, sizeof(int64_t)); else { int64_t nsize; // convert big endian cpu to little endian storage - THP_encodeInt64Buffer((uint8_t*)&nsize, (const int64_t *)&size, THPByteOrder::THP_LITTLE_ENDIAN, 1); + torch::utils::THP_encodeInt64Buffer( + (uint8_t*)&nsize, + (const int64_t*)&size, + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN, + 1); doWrite(fd, &nsize, sizeof(int64_t)); } // fast track for bytes and little endian - if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { + if (sizeof(scalar_t) == 1 || + torch::utils::THP_nativeByteOrder() == + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) { doWrite(fd, data, sizeof(scalar_t) * size); } else { int64_t buffer_size = std::min(size, (int64_t)5000); @@ -38,19 +45,22 @@ void THPStorage_(writeFileRaw)(THWStorage *self, io fd) for (int64_t i = 0; i < size; i += buffer_size) { size_t to_convert = std::min(size - i, buffer_size); if (sizeof(scalar_t) == 2) { - THP_encodeInt16Buffer((uint8_t*)le_buffer.get(), + torch::utils::THP_encodeInt16Buffer( + (uint8_t*)le_buffer.get(), (const int16_t*)data + i, - 
THPByteOrder::THP_LITTLE_ENDIAN, + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN, to_convert); } else if (sizeof(scalar_t) == 4) { - THP_encodeInt32Buffer((uint8_t*)le_buffer.get(), + torch::utils::THP_encodeInt32Buffer( + (uint8_t*)le_buffer.get(), (const int32_t*)data + i, - THPByteOrder::THP_LITTLE_ENDIAN, + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN, to_convert); } else if (sizeof(scalar_t) == 8) { - THP_encodeInt64Buffer((uint8_t*)le_buffer.get(), + torch::utils::THP_encodeInt64Buffer( + (uint8_t*)le_buffer.get(), (const int64_t*)data + i, - THPByteOrder::THP_LITTLE_ENDIAN, + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN, to_convert); } doWrite(fd, le_buffer.get(), to_convert * sizeof(scalar_t)); @@ -74,10 +84,12 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) scalar_t *data; int64_t size; doRead(file, &size, sizeof(int64_t)); - if (THP_nativeByteOrder() == THPByteOrder::THP_BIG_ENDIAN) { + if (torch::utils::THP_nativeByteOrder() == + torch::utils::THPByteOrder::THP_BIG_ENDIAN) { int64_t nsize; // convert little endian storage to big endian cpu nsize = size; - THP_decodeInt64Buffer(&size, (const uint8_t*)&nsize, THP_nativeByteOrder(), 1); + torch::utils::THP_decodeInt64Buffer( + &size, (const uint8_t*)&nsize, torch::utils::THP_nativeByteOrder(), 1); } THWStoragePtr storage; if (_storage == nullptr) { @@ -97,7 +109,9 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) #endif // fast track for bytes and little endian - if (sizeof(scalar_t) == 1 || THP_nativeByteOrder() == THPByteOrder::THP_LITTLE_ENDIAN) { + if (sizeof(scalar_t) == 1 || + torch::utils::THP_nativeByteOrder() == + torch::utils::THPByteOrder::THP_LITTLE_ENDIAN) { doRead(file, data, sizeof(scalar_t) * THWStorage_(size)(LIBRARY_STATE storage)); } else { int64_t buffer_size = std::min(size, (int64_t)5000); @@ -109,19 +123,22 @@ THWStorage * THPStorage_(readFileRaw)(io file, THWStorage *_storage) doRead(file, le_buffer.get(), sizeof(scalar_t) * to_convert); if (sizeof(scalar_t) == 2) { - THP_decodeInt16Buffer((int16_t*)data + i, + torch::utils::THP_decodeInt16Buffer( + (int16_t*)data + i, le_buffer.get(), - THP_nativeByteOrder(), + torch::utils::THP_nativeByteOrder(), to_convert); } else if (sizeof(scalar_t) == 4) { - THP_decodeInt32Buffer((int32_t*)data + i, + torch::utils::THP_decodeInt32Buffer( + (int32_t*)data + i, le_buffer.get(), - THP_nativeByteOrder(), + torch::utils::THP_nativeByteOrder(), to_convert); } else if (sizeof(scalar_t) == 8) { - THP_decodeInt64Buffer((int64_t*)data + i, + torch::utils::THP_decodeInt64Buffer( + (int64_t*)data + i, le_buffer.get(), - THP_nativeByteOrder(), + torch::utils::THP_nativeByteOrder(), to_convert); } } diff --git a/torch/csrc/byte_order.cpp b/torch/csrc/utils/byte_order.cpp similarity index 91% rename from torch/csrc/byte_order.cpp rename to torch/csrc/utils/byte_order.cpp index cf347e6a4e8c3..fbc2763c584a8 100644 --- a/torch/csrc/byte_order.cpp +++ b/torch/csrc/utils/byte_order.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include @@ -6,6 +6,8 @@ #include #endif +namespace { + static inline void swapBytes16(void *ptr) { uint16_t output; @@ -99,6 +101,11 @@ static inline uint64_t decodeUInt64BE(const uint8_t *data) { return output; } +} // anonymous namespace + +namespace torch { +namespace utils { + THPByteOrder THP_nativeByteOrder() { uint32_t x = 1; @@ -108,7 +115,8 @@ THPByteOrder THP_nativeByteOrder() void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, size_t len) { for (size_t i = 0; i < len; i++) { 
- dst[i] = (int16_t) (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src)); + dst[i] = (int16_t)( + order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src)); src += sizeof(int16_t); } } @@ -116,7 +124,8 @@ void THP_decodeInt16Buffer(int16_t* dst, const uint8_t* src, THPByteOrder order, void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, size_t len) { for (size_t i = 0; i < len; i++) { - dst[i] = (int32_t) (order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src)); + dst[i] = (int32_t)( + order == THP_BIG_ENDIAN ? decodeUInt32BE(src) : decodeUInt32LE(src)); src += sizeof(int32_t); } } @@ -124,7 +133,8 @@ void THP_decodeInt32Buffer(int32_t* dst, const uint8_t* src, THPByteOrder order, void THP_decodeInt64Buffer(int64_t* dst, const uint8_t* src, THPByteOrder order, size_t len) { for (size_t i = 0; i < len; i++) { - dst[i] = (int64_t) (order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src)); + dst[i] = (int64_t)( + order == THP_BIG_ENDIAN ? decodeUInt64BE(src) : decodeUInt64LE(src)); src += sizeof(int64_t); } } @@ -143,7 +153,8 @@ void THP_decodeHalfBuffer(THHalf* dst, const uint8_t* src, THPByteOrder order, s void THP_decodeBFloat16Buffer(at::BFloat16* dst, const uint8_t* src, THPByteOrder order, size_t len) { for (size_t i = 0; i < len; i++) { - uint16_t x = (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src)); + uint16_t x = + (order == THP_BIG_ENDIAN ? decodeUInt16BE(src) : decodeUInt16LE(src)); std::memcpy(&dst[i], &x, sizeof(dst[i])); src += sizeof(uint16_t); } @@ -232,3 +243,6 @@ void THP_encodeDoubleBuffer(uint8_t* dst, const double* src, THPByteOrder order, } } } + +} // namespace utils +} // namespace torch diff --git a/torch/csrc/utils/byte_order.h b/torch/csrc/utils/byte_order.h new file mode 100644 index 0000000000000..cc3995c72eeea --- /dev/null +++ b/torch/csrc/utils/byte_order.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace torch { +namespace utils { + +enum THPByteOrder { + THP_LITTLE_ENDIAN = 0, + THP_BIG_ENDIAN = 1 +}; + +TORCH_API THPByteOrder THP_nativeByteOrder(); + +TORCH_API void THP_decodeInt16Buffer( + int16_t* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeInt32Buffer( + int32_t* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeInt64Buffer( + int64_t* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeHalfBuffer( + THHalf* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeFloatBuffer( + float* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeDoubleBuffer( + double* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeBoolBuffer( + bool* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_decodeBFloat16Buffer( + at::BFloat16* dst, + const uint8_t* src, + THPByteOrder order, + size_t len); + +TORCH_API void THP_encodeInt16Buffer( + uint8_t* dst, + const int16_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_encodeInt32Buffer( + uint8_t* dst, + const int32_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_encodeInt64Buffer( + uint8_t* dst, + const int64_t* src, + THPByteOrder order, + size_t len); +TORCH_API void THP_encodeFloatBuffer( + uint8_t* dst, + const float* src, + THPByteOrder order, + size_t len); +TORCH_API void 
THP_encodeDoubleBuffer( + uint8_t* dst, + const double* src, + THPByteOrder order, + size_t len); + +} // namespace utils +} // namespace torch From 17b1faa2bfb0e39842d3529fc1583872682eb235 Mon Sep 17 00:00:00 2001 From: albanD Date: Thu, 3 Oct 2019 08:26:30 -0700 Subject: [PATCH 009/113] Rename jit Function to ScriptFunction Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27219 Test Plan: Imported from OSS Differential Revision: D17715306 Pulled By: albanD fbshipit-source-id: d11a7634dbee6a885c7177b240958e5aed2544f3 --- torch/_jit_internal.py | 2 +- torch/csrc/jit/script/init.cpp | 2 +- torch/jit/__init__.py | 16 ++++++++-------- torch/onnx/utils.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index 8a8f6fed5f403..af843ec8d17ab 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -630,7 +630,7 @@ def __getitem__(self, types): # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. def _qualified_name(obj): # short-circuit in cases where the object already has a known qualified name - if isinstance(obj, torch._C.Function): + if isinstance(obj, torch._C.ScriptFunction): return obj.qualified_name name = obj.__name__ diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 01faac859d13b..6c22edbb4adf8 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -685,7 +685,7 @@ void initJitScriptBindings(PyObject* module) { cu.define(c10::nullopt, src, pythonResolver(rcb), nullptr); }); - py::class_(m, "Function", py::dynamic_attr()) + py::class_(m, "ScriptFunction", py::dynamic_attr()) .def( "__call__", [](py::args args, py::kwargs kwargs) { diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index f69cb858f4afa..ca001c7c5e9cf 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -724,14 +724,14 @@ def trace(func, _module_class=None, _compilation_unit=_python_cu): """ - Trace a function and return an executable ``ScriptModule`` or ``torch._C.Function`` + Trace a function and return an executable ``ScriptModule`` or ``torch._C.ScriptFunction`` that will be optimized using just-in-time compilation. Using ``torch.jit.trace`` and :func:`torch.jit.trace_module`, you can turn an existing module or Python - function into a TorchScript ``torch._C.Function`` or ``ScriptModule``. You must provide example inputs, + function into a TorchScript ``torch._C.ScriptFunction`` or ``ScriptModule``. You must provide example inputs, and we run the function, recording the operations performed on all the tensors. - * The resulting recording of a standalone function produces ``torch._C.Function``. + * The resulting recording of a standalone function produces ``torch._C.ScriptFunction``. * The resulting recording of ``forward`` function of ``nn.Module`` or ``nn.Module`` produces ``ScriptModule``. This module also contains any parameters that the original @@ -799,7 +799,7 @@ def trace(func, a ``ScriptModule`` object with a single ``forward()`` method containing the traced code. The returned ``ScriptModule`` will have the same set of sub-modules and parameters as the original ``nn.Module``. 
- If ``callable`` is a standalone function, ``trace`` returns ``torch._C.Function`` + If ``callable`` is a standalone function, ``trace`` returns ``torch._C.ScriptFunction`` Example (tracing a function): @@ -1080,7 +1080,7 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" Scripting a function or ``nn.Module`` will inspect the source code, compile it as TorchScript code using the TorchScript compiler, and return a ``ScriptModule`` or - ``torch._C.Function``. TorchScript itself is a subset of the Python language, so not all + ``torch._C.ScriptFunction``. TorchScript itself is a subset of the Python language, so not all features in Python work, but we provide enough functionality to compute on tensors and do control-dependent operations. For a complete guide, see the `TorchScript Language Reference`_. @@ -1089,7 +1089,7 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None): ``@torch.jit.script`` for `TorchScript Classes `_ and functions. **Scripting a function** - The ``@torch.jit.script`` decorator will construct a ``torch._C.Function`` + The ``@torch.jit.script`` decorator will construct a ``torch._C.ScriptFunction`` by compiling the body of the function. Example (scripting a function): @@ -2105,8 +2105,8 @@ def _graph_for(self, *args, **kwargs): return last_executed_optimized_graph() torch._C.ScriptMethod.graph_for = _graph_for -torch._C.Function.graph_for = _graph_for -Function = torch._C.Function +torch._C.ScriptFunction.graph_for = _graph_for +ScriptFunction = torch._C.ScriptFunction if not torch._C._jit_init(): raise RuntimeError("JIT initialization failed") diff --git a/torch/onnx/utils.py b/torch/onnx/utils.py index b5091839a2093..3ee099b3e26ff 100644 --- a/torch/onnx/utils.py +++ b/torch/onnx/utils.py @@ -238,8 +238,8 @@ def _model_to_graph(model, args, verbose=False, training=False, method_graph, tuple(in_vars), False, propagate) except AttributeError: raise RuntimeError('\'forward\' method must be a script method') - elif isinstance(model, torch.jit.Function): - assert example_outputs is not None, "example_outputs must be provided when exporting a TorchScript Function" + elif isinstance(model, torch.jit.ScriptFunction): + assert example_outputs is not None, "example_outputs must be provided when exporting a TorchScript ScriptFunction" method = model params = () in_vars, in_desc = torch.jit._flatten(tuple(args)) @@ -261,7 +261,7 @@ def _model_to_graph(model, args, verbose=False, training=False, _disable_torch_constant_prop=_disable_torch_constant_prop, fixed_batch_size=fixed_batch_size) - if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.Function): + if isinstance(model, torch.jit.ScriptModule) or isinstance(model, torch.jit.ScriptFunction): out_vars, _ = torch.jit._flatten(tuple(example_outputs)) graph = _assign_output_shapes(graph, out_vars) From 5b5f398dd43e9cf8c6ab89044e9dcd2e0a26367f Mon Sep 17 00:00:00 2001 From: albanD Date: Thu, 3 Oct 2019 08:26:30 -0700 Subject: [PATCH 010/113] Make cpp-backed jit classes appear as being in torch.jit Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27220 Test Plan: Imported from OSS Differential Revision: D17715305 Pulled By: albanD fbshipit-source-id: 574704ad23ece6da7aa2780b78867307bef523cc --- test/test_jit.py | 4 ++-- torch/_jit_internal.py | 2 +- torch/jit/__init__.py | 19 +++++++++++++------ torch/utils/__init__.py | 6 ++++++ 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index a9903e5c37ab4..6757eb394b343 100644 
--- a/test/test_jit.py +++ b/test/test_jit.py @@ -4002,7 +4002,7 @@ def forward(self): _, lineno = inspect.getsourcelines(FooTest2) - with self.assertRaisesRegex(torch._C.JITException, 'test_jit.py:{}'.format(lineno + 3)): + with self.assertRaisesRegex(torch.jit.Error, 'test_jit.py:{}'.format(lineno + 3)): ft = FooTest2() loaded = self.getExportImportCopy(ft) loaded() @@ -14636,7 +14636,7 @@ def forward(self, x, use_ignore_path): buffer.seek(0) loaded = torch.jit.load(buffer) - with self.assertRaisesRegex(torch._C.JITException, "annotated to be ignored and cannot be run"): + with self.assertRaisesRegex(torch.jit.Error, "annotated to be ignored and cannot be run"): loaded(torch.tensor(.5), True) def test_module_error(self): diff --git a/torch/_jit_internal.py b/torch/_jit_internal.py index af843ec8d17ab..90359cef759e8 100644 --- a/torch/_jit_internal.py +++ b/torch/_jit_internal.py @@ -630,7 +630,7 @@ def __getitem__(self, types): # Retrieves a fully-qualified name (module hierarchy + classname) for a given obj. def _qualified_name(obj): # short-circuit in cases where the object already has a known qualified name - if isinstance(obj, torch._C.ScriptFunction): + if isinstance(obj, torch.jit.ScriptFunction): return obj.qualified_name name = obj.__name__ diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index ca001c7c5e9cf..2a47205e1a3bf 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -14,6 +14,7 @@ from torch._six import PY2, PY37, with_metaclass, string_classes from ..nn.modules.utils import _single, _pair, _triple, _quadruple, \ _list_with_default +from torch.utils import set_module import collections import contextlib @@ -65,6 +66,7 @@ def _parse_env(name, default, true_message, false_message): _python_cu = torch._C.CompilationUnit() Future = torch._C.Future +set_module(Future, "torch.jit") _fork = torch._C.fork _wait = torch._C.wait @@ -724,14 +726,14 @@ def trace(func, _module_class=None, _compilation_unit=_python_cu): """ - Trace a function and return an executable ``ScriptModule`` or ``torch._C.ScriptFunction`` + Trace a function and return an executable ``ScriptModule`` or ``torch.jit.ScriptFunction`` that will be optimized using just-in-time compilation. Using ``torch.jit.trace`` and :func:`torch.jit.trace_module`, you can turn an existing module or Python - function into a TorchScript ``torch._C.ScriptFunction`` or ``ScriptModule``. You must provide example inputs, + function into a TorchScript ``torch.jit.ScriptFunction`` or ``ScriptModule``. You must provide example inputs, and we run the function, recording the operations performed on all the tensors. - * The resulting recording of a standalone function produces ``torch._C.ScriptFunction``. + * The resulting recording of a standalone function produces ``torch.jit.ScriptFunction``. * The resulting recording of ``forward`` function of ``nn.Module`` or ``nn.Module`` produces ``ScriptModule``. This module also contains any parameters that the original @@ -799,7 +801,7 @@ def trace(func, a ``ScriptModule`` object with a single ``forward()`` method containing the traced code. The returned ``ScriptModule`` will have the same set of sub-modules and parameters as the original ``nn.Module``. 
- If ``callable`` is a standalone function, ``trace`` returns ``torch._C.ScriptFunction`` + If ``callable`` is a standalone function, ``trace`` returns ``torch.jit.ScriptFunction`` Example (tracing a function): @@ -1080,7 +1082,7 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" Scripting a function or ``nn.Module`` will inspect the source code, compile it as TorchScript code using the TorchScript compiler, and return a ``ScriptModule`` or - ``torch._C.ScriptFunction``. TorchScript itself is a subset of the Python language, so not all + ``torch.jit.ScriptFunction``. TorchScript itself is a subset of the Python language, so not all features in Python work, but we provide enough functionality to compute on tensors and do control-dependent operations. For a complete guide, see the `TorchScript Language Reference`_. @@ -1089,7 +1091,7 @@ def script(obj, optimize=None, _frames_up=0, _rcb=None): ``@torch.jit.script`` for `TorchScript Classes `_ and functions. **Scripting a function** - The ``@torch.jit.script`` decorator will construct a ``torch._C.ScriptFunction`` + The ``@torch.jit.script`` decorator will construct a ``torch.jit.ScriptFunction`` by compiling the body of the function. Example (scripting a function): @@ -2058,6 +2060,10 @@ def _check_directly_compile_overloaded(obj): # torch.jit.Error Error = torch._C.JITException +set_module(Error, "torch.jit") +# This is not perfect but works in common cases +Error.__name__ = "Error" +Error.__qualname__ = "Error" def _get_named_tuple_properties(obj): assert issubclass(obj, tuple) and hasattr(obj, '_fields') @@ -2107,6 +2113,7 @@ def _graph_for(self, *args, **kwargs): torch._C.ScriptMethod.graph_for = _graph_for torch._C.ScriptFunction.graph_for = _graph_for ScriptFunction = torch._C.ScriptFunction +set_module(ScriptFunction, "torch.jit") if not torch._C._jit_init(): raise RuntimeError("JIT initialization failed") diff --git a/torch/utils/__init__.py b/torch/utils/__init__.py index 14ce713dfccfa..8912b870e6923 100644 --- a/torch/utils/__init__.py +++ b/torch/utils/__init__.py @@ -1,3 +1,9 @@ from __future__ import absolute_import, division, print_function, unicode_literals from .throughput_benchmark import ThroughputBenchmark # noqa: F401 + +# Set the module for a given object for nicer printing +def set_module(obj, mod): + if not isinstance(mod, str): + raise TypeError("The mod argument should be a string") + obj.__module__ = mod From 8fbefa06f6a863503118ebaa452ec84ed862fb31 Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Thu, 3 Oct 2019 08:27:23 -0700 Subject: [PATCH 011/113] Avoid configuring ROCm if USE_CUDA is on. 
From 8fbefa06f6a863503118ebaa452ec84ed862fb31 Mon Sep 17 00:00:00 2001
From: Hong Xu
Date: Thu, 3 Oct 2019 08:27:23 -0700
Subject: [PATCH 011/113] Avoid configuring ROCm if USE_CUDA is on.
 (#26910)

Summary:
Move the resolution of the conflict between `USE_CUDA` and `USE_ROCM` into CMake, so as to effect the following behavior:

- `USE_CUDA=ON` and CUDA is found, and `USE_ROCM=ON` and ROCm is found --> fatal error
- Either `USE_CUDA=ON` and CUDA is found, or `USE_ROCM=ON` and ROCm is found --> the respective GPU feature is ON
- Otherwise --> no GPU support

Pull Request resolved: https://github.com/pytorch/pytorch/pull/26910

Differential Revision: D17738652

Pulled By: ezyang

fbshipit-source-id: 8e07cc7e922e0abda24a6518119c28952276064e
---
 aten/CMakeLists.txt         | 8 ++++++--
 tools/setup_helpers/cuda.py | 4 ++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt
index 77ed7da438129..944491cf21ea7 100644
--- a/aten/CMakeLists.txt
+++ b/aten/CMakeLists.txt
@@ -55,11 +55,15 @@ add_subdirectory(src/THNN)
 IF(USE_ROCM)
   include(LoadHIP)
   if (NOT PYTORCH_FOUND_HIP)
-    MESSAGE(FATAL_ERROR
-      "Could not find HIP installation")
+    set(USE_ROCM OFF)
   endif()
 ENDIF()
 
+# Both CUDA and ROCM are enabled and found. Report an error.
+if(USE_CUDA AND USE_ROCM)
+  message(FATAL_ERROR "Both CUDA and ROCm are enabled and found. PyTorch can only be built with either of them. Please turn one off by using either USE_CUDA=OFF or USE_ROCM=OFF.")
+endif()
+
 IF(MSVC)
   # we want to respect the standard, and we are bored of those **** .
   ADD_DEFINITIONS(-D_CRT_SECURE_NO_DEPRECATE=1)
diff --git a/tools/setup_helpers/cuda.py b/tools/setup_helpers/cuda.py
index 25d5e3e46b339..3a1a27712e6e8 100644
--- a/tools/setup_helpers/cuda.py
+++ b/tools/setup_helpers/cuda.py
@@ -5,7 +5,7 @@
 from subprocess import Popen, PIPE
 
 from . import which
-from .env import IS_WINDOWS, IS_LINUX, IS_DARWIN, check_env_flag, check_negative_env_flag
+from .env import IS_WINDOWS, IS_LINUX, IS_DARWIN, check_negative_env_flag
 
 LINUX_HOME = '/usr/local/cuda'
 WINDOWS_HOME = glob.glob('C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*')
@@ -53,7 +53,7 @@ def find_cuda_version(cuda_home):
     if len(candidates) > 0:
         return candidates[0]
 
-if check_negative_env_flag('USE_CUDA') or check_env_flag('USE_ROCM'):
+if check_negative_env_flag('USE_CUDA'):
     USE_CUDA = False
     CUDA_HOME = None
     CUDA_VERSION = None
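To make the three bullet outcomes above concrete, here is a small standalone Python model of the resolution logic (all names are invented for illustration; the authoritative logic is the CMake hunk above):

```
# Toy model of the USE_CUDA/USE_ROCM resolution that the CMake change implements.
def resolve_gpu_backends(use_cuda, cuda_found, use_rocm, rocm_found):
    cuda = use_cuda and cuda_found
    rocm = use_rocm and rocm_found
    if cuda and rocm:
        # Mirrors the new message(FATAL_ERROR ...) branch.
        raise RuntimeError("Both CUDA and ROCm are enabled and found; "
                           "turn one off with USE_CUDA=OFF or USE_ROCM=OFF.")
    # At most one backend stays on; both False means a CPU-only build.
    return {"USE_CUDA": cuda, "USE_ROCM": rocm}


print(resolve_gpu_backends(True, True, False, False))
# {'USE_CUDA': True, 'USE_ROCM': False}
print(resolve_gpu_backends(False, False, True, False))
# {'USE_CUDA': False, 'USE_ROCM': False}  -> CPU-only build
```

Moving the check into CMake, as the summary suggests, means the rule is enforced for any build that goes through CMake, not only builds driven by the Python setup helpers.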
From 7e95b899801bf27409b77107aee41856eab3fee9 Mon Sep 17 00:00:00 2001
From: Will Feng
Date: Thu, 3 Oct 2019 09:31:59 -0700
Subject: [PATCH 012/113] Revert "Add std::variant backport as c10::variant
 (#26836)" (#27277)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
This reverts commit 0cd188035a27fc38ce1e8eee205f6d47cd7650e6.

As reported by jerryzh168 and pritamdamania87, mpark::variant doesn’t compile with gcc 7.3.1 on the fb devserver and throws an error similar to https://github.com/mpark/variant/issues/43. (However, it doesn’t fail with gcc 7.3.1 in OSS CI, based on https://circleci.com/api/v1.1/project/github/pytorch/pytorch/2995606/output/107/0?file=true)

A plausible workaround is to upgrade the devserver to devtoolset-8, but that would in turn cause the CUDA build to complain:
```
/usr/local/cuda/bin/../targets/x86_64-linux/include/crt/host_config.h:119:2: error: #error -- unsupported GNU version! gcc versions later than 7 are not supported!
 #error -- unsupported GNU version! gcc versions later than 7 are not supported!
```
(Thanks pritamdamania87 for the report!)

The solution for now is to revert the mpark::variant addition, and I will find alternatives that work with gcc 7.3.1 on the fb devserver.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/27277

Differential Revision: D17739804

fbshipit-source-id: ad945b3d86ab7ddbff58f4ecab95e0e1ac725ae9
---
 aten/src/ATen/test/CMakeLists.txt   |    3 +-
 aten/src/ATen/test/variant_test.cpp |   41 -
 c10/util/variant.h                  | 2822 ---------------------
 3 files changed, 1 insertion(+), 2865 deletions(-)
 delete mode 100644 aten/src/ATen/test/variant_test.cpp
 delete mode 100644 c10/util/variant.h

diff --git a/aten/src/ATen/test/CMakeLists.txt b/aten/src/ATen/test/CMakeLists.txt
index 0455ce9cd8035..80fdf4fc0f7ef 100644
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@@ -29,8 +29,7 @@ list(APPEND ATen_CPU_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/xla_tensor_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/tensor_iterator_test.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/cpu_generator_test.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/variant_test.cpp)
+  ${CMAKE_CURRENT_SOURCE_DIR}/pow_test.cpp)
 
 list(APPEND ATen_CUDA_TEST_SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/cuda_integer_divider_test.cu
diff --git a/aten/src/ATen/test/variant_test.cpp b/aten/src/ATen/test/variant_test.cpp
deleted file mode 100644
index da741e15bb08a..0000000000000
--- a/aten/src/ATen/test/variant_test.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include <gtest/gtest.h>
-
-#include <c10/util/variant.h>
-
-namespace testns {
-
-namespace enumtype {
-  // NOTE: We need to provide the default constructor for each struct,
-  // otherwise Clang 3.8 would complain:
-  // ```
-  // error: default initialization of an object of const type 'const enumtype::Enum1'
-  // without a user-provided default constructor
-  // ```
-  struct Enum1 { Enum1() {} };
-  struct Enum2 { Enum2() {} };
-  struct Enum3 { Enum3() {} };
-} // namespace enumtype
-
-const enumtype::Enum1 kEnum1;
-const enumtype::Enum2 kEnum2;
-const enumtype::Enum3 kEnum3;
-
-} // namespace testns
-
-std::string func(c10::variant<testns::enumtype::Enum1, testns::enumtype::Enum2, testns::enumtype::Enum3> v) {
-  if (c10::get_if<testns::enumtype::Enum1>(&v)) {
-    return "Enum1";
-  } else if (c10::get_if<testns::enumtype::Enum2>(&v)) {
-    return "Enum2";
-  } else if (c10::get_if<testns::enumtype::Enum3>(&v)) {
-    return "Enum3";
-  } else {
-    return "Unsupported enum";
-  }
-}
-
-TEST(VariantTest, Basic) {
-  ASSERT_EQ(func(testns::kEnum1), "Enum1");
-  ASSERT_EQ(func(testns::kEnum2), "Enum2");
-  ASSERT_EQ(func(testns::kEnum3), "Enum3");
-}
diff --git a/c10/util/variant.h b/c10/util/variant.h
deleted file mode 100644
index b183641fe3415..0000000000000
--- a/c10/util/variant.h
+++ /dev/null
@@ -1,2822 +0,0 @@
-// MPark.Variant
-//
-// Copyright Michael Park, 2015-2017
-//
-// Distributed under the Boost Software License, Version 1.0.
-// (See accompanying file LICENSE.md or copy at http://boost.org/LICENSE_1_0.txt)
-//
-// From https://github.com/mpark/variant
-//
-// C10
-// - Move to `c10` namespace.
-// - Rename namespace `detail` to `detail_`, to not conflict with existing
-//   c10 implementations in `detail` namespace.
-// - `struct in_place_t` is renamed to `struct variant_in_place_t`, to not
-//   conflict with `struct in_place_t` in c10/util/Optional.h.
- -#ifndef C10_UTIL_VARIANT_H_ -#define C10_UTIL_VARIANT_H_ - -/* - variant synopsis - -namespace std { - - // 20.7.2, class template variant - template - class variant { - public: - - // 20.7.2.1, constructors - constexpr variant() noexcept(see below); - variant(const variant&); - variant(variant&&) noexcept(see below); - - template constexpr variant(T&&) noexcept(see below); - - template - constexpr explicit variant(in_place_type_t, Args&&...); - - template - constexpr explicit variant( - in_place_type_t, initializer_list, Args&&...); - - template - constexpr explicit variant(in_place_index_t, Args&&...); - - template - constexpr explicit variant( - in_place_index_t, initializer_list, Args&&...); - - // 20.7.2.2, destructor - ~variant(); - - // 20.7.2.3, assignment - variant& operator=(const variant&); - variant& operator=(variant&&) noexcept(see below); - - template variant& operator=(T&&) noexcept(see below); - - // 20.7.2.4, modifiers - template - T& emplace(Args&&...); - - template - T& emplace(initializer_list, Args&&...); - - template - variant_alternative& emplace(Args&&...); - - template - variant_alternative& emplace(initializer_list, Args&&...); - - // 20.7.2.5, value status - constexpr bool valueless_by_exception() const noexcept; - constexpr size_t index() const noexcept; - - // 20.7.2.6, swap - void swap(variant&) noexcept(see below); - }; - - // 20.7.3, variant helper classes - template struct variant_size; // undefined - - template - constexpr size_t variant_size_v = variant_size::value; - - template struct variant_size; - template struct variant_size; - template struct variant_size; - - template - struct variant_size>; - - template struct variant_alternative; // undefined - - template - using variant_alternative_t = typename variant_alternative::type; - - template struct variant_alternative; - template struct variant_alternative; - template struct variant_alternative; - - template - struct variant_alternative>; - - constexpr size_t variant_npos = -1; - - // 20.7.4, value access - template - constexpr bool holds_alternative(const variant&) noexcept; - - template - constexpr variant_alternative_t>& - get(variant&); - - template - constexpr variant_alternative_t>&& - get(variant&&); - - template - constexpr variant_alternative_t> const& - get(const variant&); - - template - constexpr variant_alternative_t> const&& - get(const variant&&); - - template - constexpr T& get(variant&); - - template - constexpr T&& get(variant&&); - - template - constexpr const T& get(const variant&); - - template - constexpr const T&& get(const variant&&); - - template - constexpr add_pointer_t>> - get_if(variant*) noexcept; - - template - constexpr add_pointer_t>> - get_if(const variant*) noexcept; - - template - constexpr add_pointer_t - get_if(variant*) noexcept; - - template - constexpr add_pointer_t - get_if(const variant*) noexcept; - - // 20.7.5, relational operators - template - constexpr bool operator==(const variant&, const variant&); - - template - constexpr bool operator!=(const variant&, const variant&); - - template - constexpr bool operator<(const variant&, const variant&); - - template - constexpr bool operator>(const variant&, const variant&); - - template - constexpr bool operator<=(const variant&, const variant&); - - template - constexpr bool operator>=(const variant&, const variant&); - - // 20.7.6, visitation - template - constexpr see below visit(Visitor&&, Variants&&...); - - // 20.7.7, class monostate - struct monostate; - - // 20.7.8, monostate relational operators - 
constexpr bool operator<(monostate, monostate) noexcept; - constexpr bool operator>(monostate, monostate) noexcept; - constexpr bool operator<=(monostate, monostate) noexcept; - constexpr bool operator>=(monostate, monostate) noexcept; - constexpr bool operator==(monostate, monostate) noexcept; - constexpr bool operator!=(monostate, monostate) noexcept; - - // 20.7.9, specialized algorithms - template - void swap(variant&, variant&) noexcept(see below); - - // 20.7.10, class bad_variant_access - class bad_variant_access; - - // 20.7.11, hash support - template struct hash; - template struct hash>; - template <> struct hash; - -} // namespace std - -*/ - -#include -#include -#include -#include -#include -#include -#include - -// MPark.Variant -// -// Copyright Michael Park, 2015-2017 -// -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.md or copy at http://boost.org/LICENSE_1_0.txt) - -#ifndef MPARK_CONFIG_HPP -#define MPARK_CONFIG_HPP - -// MSVC 2015 Update 3. -#if __cplusplus < 201103L && (!defined(_MSC_VER) || _MSC_FULL_VER < 190024210) -#error "MPark.Variant requires C++11 support." -#endif - -#ifndef __has_attribute -#define __has_attribute(x) 0 -#endif - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - -#ifndef __has_include -#define __has_include(x) 0 -#endif - -#ifndef __has_feature -#define __has_feature(x) 0 -#endif - -#if __has_attribute(always_inline) || defined(__GNUC__) -#define MPARK_ALWAYS_INLINE __attribute__((__always_inline__)) inline -#elif defined(_MSC_VER) -#define MPARK_ALWAYS_INLINE __forceinline -#else -#define MPARK_ALWAYS_INLINE inline -#endif - -#if __has_builtin(__builtin_addressof) || \ - (defined(__GNUC__) && __GNUC__ >= 7) || defined(_MSC_VER) -#define MPARK_BUILTIN_ADDRESSOF -#endif - -#if __has_builtin(__builtin_unreachable) || defined(__GNUC__) -#define MPARK_BUILTIN_UNREACHABLE __builtin_unreachable() -#elif defined(_MSC_VER) -#define MPARK_BUILTIN_UNREACHABLE __assume(false) -#else -#define MPARK_BUILTIN_UNREACHABLE -#endif - -#if __has_builtin(__type_pack_element) -#define MPARK_TYPE_PACK_ELEMENT -#endif - -#if defined(__cpp_constexpr) && __cpp_constexpr >= 200704 && \ - !(defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ == 9) -#define MPARK_CPP11_CONSTEXPR -#endif - -#if defined(__cpp_constexpr) && __cpp_constexpr >= 201304 -#define MPARK_CPP14_CONSTEXPR -#endif - -#if __has_feature(cxx_exceptions) || defined(__cpp_exceptions) || \ - (defined(_MSC_VER) && defined(_CPPUNWIND)) -#define MPARK_EXCEPTIONS -#endif - -#if defined(__cpp_generic_lambdas) || defined(_MSC_VER) -#define MPARK_GENERIC_LAMBDAS -#endif - -#if defined(__cpp_lib_integer_sequence) -#define MPARK_INTEGER_SEQUENCE -#endif - -#if defined(__cpp_return_type_deduction) || defined(_MSC_VER) -#define MPARK_RETURN_TYPE_DEDUCTION -#endif - -#if defined(__cpp_lib_transparent_operators) || defined(_MSC_VER) -#define MPARK_TRANSPARENT_OPERATORS -#endif - -#if defined(__cpp_variable_templates) || defined(_MSC_VER) -#define MPARK_VARIABLE_TEMPLATES -#endif - -#if !defined(__GLIBCXX__) || __has_include() // >= libstdc++-5 -#define MPARK_TRIVIALITY_TYPE_TRAITS -#define MPARK_INCOMPLETE_TYPE_TRAITS -#endif - -#endif // MPARK_CONFIG_HPP - -// MPark.Variant -// -// Copyright Michael Park, 2015-2017 -// -// Distributed under the Boost Software License, Version 1.0. 
-// (See accompanying file LICENSE.md or copy at http://boost.org/LICENSE_1_0.txt) - -#ifndef MPARK_IN_PLACE_HPP -#define MPARK_IN_PLACE_HPP - -#include - - -namespace c10 { - - struct variant_in_place_t { explicit variant_in_place_t() = default; }; - - template - struct in_place_index_t { explicit in_place_index_t() = default; }; - - template - struct in_place_type_t { explicit in_place_type_t() = default; }; - -#ifdef MPARK_VARIABLE_TEMPLATES - constexpr variant_in_place_t in_place{}; - - template constexpr in_place_index_t in_place_index{}; - - template constexpr in_place_type_t in_place_type{}; -#endif - -} // namespace c10 - -#endif // MPARK_IN_PLACE_HPP - -// MPark.Variant -// -// Copyright Michael Park, 2015-2017 -// -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE.md or copy at http://boost.org/LICENSE_1_0.txt) - -#ifndef MPARK_LIB_HPP -#define MPARK_LIB_HPP - -#include -#include -#include -#include - - -#define MPARK_RETURN(...) \ - noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) { return __VA_ARGS__; } - -namespace c10 { - namespace lib { - template - struct identity { using type = T; }; - - inline namespace cpp14 { - template - struct array { - constexpr const T &operator[](std::size_t index) const { - return data[index]; - } - - T data[N == 0 ? 1 : N]; - }; - - template - using add_pointer_t = typename std::add_pointer::type; - - template - using common_type_t = typename std::common_type::type; - - template - using decay_t = typename std::decay::type; - - template - using enable_if_t = typename std::enable_if::type; - - template - using remove_const_t = typename std::remove_const::type; - - template - using remove_reference_t = typename std::remove_reference::type; - - template - inline constexpr T &&forward(remove_reference_t &t) noexcept { - return static_cast(t); - } - - template - inline constexpr T &&forward(remove_reference_t &&t) noexcept { - static_assert(!std::is_lvalue_reference::value, - "can not forward an rvalue as an lvalue"); - return static_cast(t); - } - - template - inline constexpr remove_reference_t &&move(T &&t) noexcept { - return static_cast &&>(t); - } - -#ifdef MPARK_INTEGER_SEQUENCE - using std::integer_sequence; - using std::index_sequence; - using std::make_index_sequence; - using std::index_sequence_for; -#else - template - struct integer_sequence { - using value_type = T; - static constexpr std::size_t size() noexcept { return sizeof...(Is); } - }; - - template - using index_sequence = integer_sequence; - - template - struct make_index_sequence_concat; - - template - struct make_index_sequence_concat, - index_sequence> - : identity> {}; - - template - struct make_index_sequence_impl; - - template - using make_index_sequence = typename make_index_sequence_impl::type; - - template - struct make_index_sequence_impl - : make_index_sequence_concat, - make_index_sequence> {}; - - template <> - struct make_index_sequence_impl<0> : identity> {}; - - template <> - struct make_index_sequence_impl<1> : identity> {}; - - template - using index_sequence_for = make_index_sequence; -#endif - - // -#ifdef MPARK_TRANSPARENT_OPERATORS - using equal_to = std::equal_to<>; -#else - struct equal_to { - template - inline constexpr auto operator()(Lhs &&lhs, Rhs &&rhs) const - MPARK_RETURN(lib::forward(lhs) == lib::forward(rhs)) - }; -#endif - -#ifdef MPARK_TRANSPARENT_OPERATORS - using not_equal_to = std::not_equal_to<>; -#else - struct not_equal_to { - template - inline constexpr auto operator()(Lhs &&lhs, Rhs 
&&rhs) const - MPARK_RETURN(lib::forward(lhs) != lib::forward(rhs)) - }; -#endif - -#ifdef MPARK_TRANSPARENT_OPERATORS - using less = std::less<>; -#else - struct less { - template - inline constexpr auto operator()(Lhs &&lhs, Rhs &&rhs) const - MPARK_RETURN(lib::forward(lhs) < lib::forward(rhs)) - }; -#endif - -#ifdef MPARK_TRANSPARENT_OPERATORS - using greater = std::greater<>; -#else - struct greater { - template - inline constexpr auto operator()(Lhs &&lhs, Rhs &&rhs) const - MPARK_RETURN(lib::forward(lhs) > lib::forward(rhs)) - }; -#endif - -#ifdef MPARK_TRANSPARENT_OPERATORS - using less_equal = std::less_equal<>; -#else - struct less_equal { - template - inline constexpr auto operator()(Lhs &&lhs, Rhs &&rhs) const - MPARK_RETURN(lib::forward(lhs) <= lib::forward(rhs)) - }; -#endif - -#ifdef MPARK_TRANSPARENT_OPERATORS - using greater_equal = std::greater_equal<>; -#else - struct greater_equal { - template - inline constexpr auto operator()(Lhs &&lhs, Rhs &&rhs) const - MPARK_RETURN(lib::forward(lhs) >= lib::forward(rhs)) - }; -#endif - } // namespace cpp14 - - inline namespace cpp17 { - - // - template - using bool_constant = std::integral_constant; - - template - struct voider : identity {}; - - template - using void_t = typename voider::type; - - namespace detail_ { - namespace swappable { - - using std::swap; - - template - struct is_swappable { - private: - template (), - std::declval()))> - inline static std::true_type test(int); - - template - inline static std::false_type test(...); - - public: - static constexpr bool value = decltype(test(0))::value; - }; - - template - struct is_nothrow_swappable { - static constexpr bool value = - noexcept(swap(std::declval(), std::declval())); - }; - - template - struct is_nothrow_swappable : std::false_type {}; - - } // namespace swappable - } // namespace detail_ - - using detail_::swappable::is_swappable; - - template - using is_nothrow_swappable = - detail_::swappable::is_nothrow_swappable::value, T>; - - // - namespace detail_ { - - template - struct is_reference_wrapper : std::false_type {}; - - template - struct is_reference_wrapper> - : std::true_type {}; - - template - struct Invoke; - - template <> - struct Invoke { - template - inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&... args) - MPARK_RETURN((lib::forward(arg).*pmf)(lib::forward(args)...)) - }; - - template <> - struct Invoke { - template - inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&... args) - MPARK_RETURN((lib::forward(arg).get().*pmf)(lib::forward(args)...)) - }; - - template <> - struct Invoke { - template - inline static constexpr auto invoke(R T::*pmf, Arg &&arg, Args &&... args) - MPARK_RETURN(((*lib::forward(arg)).*pmf)(lib::forward(args)...)) - }; - - template <> - struct Invoke { - template - inline static constexpr auto invoke(R T::*pmo, Arg &&arg) - MPARK_RETURN(lib::forward(arg).*pmo) - }; - - template <> - struct Invoke { - template - inline static constexpr auto invoke(R T::*pmo, Arg &&arg) - MPARK_RETURN(lib::forward(arg).get().*pmo) - }; - - template <> - struct Invoke { - template - inline static constexpr auto invoke(R T::*pmo, Arg &&arg) - MPARK_RETURN((*lib::forward(arg)).*pmo) - }; - - template - inline constexpr auto invoke(R T::*f, Arg &&arg, Args &&... args) - MPARK_RETURN( - Invoke::value, - (std::is_base_of>::value - ? 0 - : is_reference_wrapper>::value - ? 
1 - : 2)>::invoke(f, - lib::forward(arg), - lib::forward(args)...)) - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) -#endif - template - inline constexpr auto invoke(F &&f, Args &&... args) - MPARK_RETURN(lib::forward(f)(lib::forward(args)...)) -#ifdef _MSC_VER -#pragma warning(pop) -#endif - } // namespace detail_ - - template - inline constexpr auto invoke(F &&f, Args &&... args) - MPARK_RETURN(detail_::invoke(lib::forward(f), - lib::forward(args)...)) - - namespace detail_ { - - template - struct invoke_result {}; - - template - struct invoke_result(), std::declval()...))>, - F, - Args...> - : identity(), std::declval()...))> {}; - - } // namespace detail_ - - template - using invoke_result = detail_::invoke_result; - - template - using invoke_result_t = typename invoke_result::type; - - namespace detail_ { - - template - struct is_invocable : std::false_type {}; - - template - struct is_invocable>, F, Args...> - : std::true_type {}; - - template - struct is_invocable_r : std::false_type {}; - - template - struct is_invocable_r>, - R, - F, - Args...> - : std::is_convertible, R> {}; - - } // namespace detail_ - - template - using is_invocable = detail_::is_invocable; - - template - using is_invocable_r = detail_::is_invocable_r; - - namespace detail_ { - - template - struct is_nothrow_invocable { - static constexpr bool value = - noexcept(lib::invoke(std::declval(), std::declval()...)); - }; - - template - struct is_nothrow_invocable : std::false_type {}; - - template - struct is_nothrow_invocable_r { - private: - inline static R impl() { - return lib::invoke(std::declval(), std::declval()...); - } - - public: - static constexpr bool value = noexcept(impl()); - }; - - template - struct is_nothrow_invocable_r : std::false_type {}; - - } // namespace detail_ - - template - using is_nothrow_invocable = detail_:: - is_nothrow_invocable::value, F, Args...>; - - template - using is_nothrow_invocable_r = - detail_::is_nothrow_invocable_r::value, - R, - F, - Args...>; - - // -#ifdef MPARK_BUILTIN_ADDRESSOF - template - inline constexpr T *addressof(T &arg) noexcept { - return __builtin_addressof(arg); - } -#else - namespace detail_ { - - namespace has_addressof_impl { - - struct fail; - - template - inline fail operator&(T &&); - - template - inline static constexpr bool impl() { - return (std::is_class::value || std::is_union::value) && - !std::is_same()), fail>::value; - } - - } // namespace has_addressof_impl - - template - using has_addressof = bool_constant()>; - - template - inline constexpr T *addressof(T &arg, std::true_type) noexcept { - return std::addressof(arg); - } - - template - inline constexpr T *addressof(T &arg, std::false_type) noexcept { - return &arg; - } - - } // namespace detail_ - - template - inline constexpr T *addressof(T &arg) noexcept { - return detail_::addressof(arg, detail_::has_addressof{}); - } -#endif - - template - inline constexpr T *addressof(const T &&) = delete; - - } // namespace cpp17 - - template - struct remove_all_extents : identity {}; - - template - struct remove_all_extents> : remove_all_extents {}; - - template - using remove_all_extents_t = typename remove_all_extents::type; - - template - using size_constant = std::integral_constant; - - template - struct indexed_type : size_constant { using type = T; }; - - template - using all = std::is_same, - integer_sequence>; - -#ifdef MPARK_TYPE_PACK_ELEMENT - template - using type_pack_element_t = __type_pack_element; -#else - template - struct type_pack_element_impl { - 
private: - template - struct set; - - template - struct set> : indexed_type... {}; - - template - inline static std::enable_if impl(indexed_type); - - inline static std::enable_if impl(...); - - public: - using type = decltype(impl(set>{})); - }; - - template - using type_pack_element = typename type_pack_element_impl::type; - - template - using type_pack_element_t = typename type_pack_element::type; -#endif - -#ifdef MPARK_TRIVIALITY_TYPE_TRAITS - using std::is_trivially_copy_constructible; - using std::is_trivially_move_constructible; - using std::is_trivially_copy_assignable; - using std::is_trivially_move_assignable; -#else - template - struct is_trivially_copy_constructible - : bool_constant< - std::is_copy_constructible::value && __has_trivial_copy(T)> {}; - - template - struct is_trivially_move_constructible : bool_constant<__is_trivial(T)> {}; - - template - struct is_trivially_copy_assignable - : bool_constant< - std::is_copy_assignable::value && __has_trivial_assign(T)> {}; - - template - struct is_trivially_move_assignable : bool_constant<__is_trivial(T)> {}; -#endif - - template - struct dependent_type : T {}; - - template - struct push_back; - - template - using push_back_t = typename push_back::type; - - template - struct push_back, J> { - using type = index_sequence; - }; - - } // namespace lib -} // namespace c10 - -#undef MPARK_RETURN - -#endif // MPARK_LIB_HPP - - -namespace c10 { - -#ifdef MPARK_RETURN_TYPE_DEDUCTION - -#define AUTO auto -#define AUTO_RETURN(...) { return __VA_ARGS__; } - -#define AUTO_REFREF auto && -#define AUTO_REFREF_RETURN(...) { return __VA_ARGS__; } - -#define DECLTYPE_AUTO decltype(auto) -#define DECLTYPE_AUTO_RETURN(...) { return __VA_ARGS__; } - -#else - -#define AUTO auto -#define AUTO_RETURN(...) \ - -> lib::decay_t { return __VA_ARGS__; } - -#define AUTO_REFREF auto -#define AUTO_REFREF_RETURN(...) \ - -> decltype((__VA_ARGS__)) { \ - static_assert(std::is_reference::value, ""); \ - return __VA_ARGS__; \ - } - -#define DECLTYPE_AUTO auto -#define DECLTYPE_AUTO_RETURN(...) 
\ - -> decltype(__VA_ARGS__) { return __VA_ARGS__; } - -#endif - - class bad_variant_access : public std::exception { - public: - virtual const char *what() const noexcept override { return "bad_variant_access"; } - }; - - [[noreturn]] inline void throw_bad_variant_access() { -#ifdef MPARK_EXCEPTIONS - throw bad_variant_access{}; -#else - std::terminate(); - MPARK_BUILTIN_UNREACHABLE; -#endif - } - - template - class variant; - - template - struct variant_size; - -#ifdef MPARK_VARIABLE_TEMPLATES - template - constexpr std::size_t variant_size_v = variant_size::value; -#endif - - template - struct variant_size : variant_size {}; - - template - struct variant_size : variant_size {}; - - template - struct variant_size : variant_size {}; - - template - struct variant_size> : lib::size_constant {}; - - template - struct variant_alternative; - - template - using variant_alternative_t = typename variant_alternative::type; - - template - struct variant_alternative - : std::add_const> {}; - - template - struct variant_alternative - : std::add_volatile> {}; - - template - struct variant_alternative - : std::add_cv> {}; - - template - struct variant_alternative> { - static_assert(I < sizeof...(Ts), - "index out of bounds in `std::variant_alternative<>`"); - using type = lib::type_pack_element_t; - }; - - constexpr std::size_t variant_npos = static_cast(-1); - - namespace detail_ { - - constexpr std::size_t not_found = static_cast(-1); - constexpr std::size_t ambiguous = static_cast(-2); - -#ifdef MPARK_CPP14_CONSTEXPR - template - inline constexpr std::size_t find_index() { - constexpr lib::array matches = { - {std::is_same::value...} - }; - std::size_t result = not_found; - for (std::size_t i = 0; i < sizeof...(Ts); ++i) { - if (matches[i]) { - if (result != not_found) { - return ambiguous; - } - result = i; - } - } - return result; - } -#else - inline constexpr std::size_t find_index_impl(std::size_t result, - std::size_t) { - return result; - } - - template - inline constexpr std::size_t find_index_impl(std::size_t result, - std::size_t idx, - bool b, - Bs... bs) { - return b ? (result != not_found ? ambiguous - : find_index_impl(idx, idx + 1, bs...)) - : find_index_impl(result, idx + 1, bs...); - } - - template - inline constexpr std::size_t find_index() { - return find_index_impl(not_found, 0, std::is_same::value...); - } -#endif - - template - using find_index_sfinae_impl = - lib::enable_if_t>; - - template - using find_index_sfinae = find_index_sfinae_impl()>; - - template - struct find_index_checked_impl : lib::size_constant { - static_assert(I != not_found, "the specified type is not found."); - static_assert(I != ambiguous, "the specified type is ambiguous."); - }; - - template - using find_index_checked = find_index_checked_impl()>; - - struct valueless_t {}; - - enum class Trait { TriviallyAvailable, Available, Unavailable }; - - template class IsTriviallyAvailable, - template class IsAvailable> - inline constexpr Trait trait() { - return IsTriviallyAvailable::value - ? Trait::TriviallyAvailable - : IsAvailable::value ? Trait::Available - : Trait::Unavailable; - } - -#ifdef MPARK_CPP14_CONSTEXPR - template - inline constexpr Trait common_trait(Traits... 
traits_) { - Trait result = Trait::TriviallyAvailable; - lib::array traits = {{traits_...}}; - for (std::size_t i = 0; i < sizeof...(Traits); ++i) { - Trait t = traits[i]; - if (static_cast(t) > static_cast(result)) { - result = t; - } - } - return result; - } -#else - inline constexpr Trait common_trait_impl(Trait result) { return result; } - - template - inline constexpr Trait common_trait_impl(Trait result, - Trait t, - Traits... ts) { - return static_cast(t) > static_cast(result) - ? common_trait_impl(t, ts...) - : common_trait_impl(result, ts...); - } - - template - inline constexpr Trait common_trait(Traits... ts) { - return common_trait_impl(Trait::TriviallyAvailable, ts...); - } -#endif - - template - struct traits { - static constexpr Trait copy_constructible_trait = - common_trait(trait()...); - - static constexpr Trait move_constructible_trait = - common_trait(trait()...); - - static constexpr Trait copy_assignable_trait = - common_trait(copy_constructible_trait, - trait()...); - - static constexpr Trait move_assignable_trait = - common_trait(move_constructible_trait, - trait()...); - - static constexpr Trait destructible_trait = - common_trait(trait()...); - }; - - namespace access { - - struct recursive_union { -#ifdef MPARK_RETURN_TYPE_DEDUCTION - template - inline static constexpr auto &&get_alt(V &&v, in_place_index_t<0>) { - return lib::forward(v).head_; - } - - template - inline static constexpr auto &&get_alt(V &&v, in_place_index_t) { - return get_alt(lib::forward(v).tail_, in_place_index_t{}); - } -#else - template - struct get_alt_impl { - template - inline constexpr AUTO_REFREF operator()(V &&v) const - AUTO_REFREF_RETURN(get_alt_impl{}(lib::forward(v).tail_)) - }; - - template - struct get_alt_impl<0, Dummy> { - template - inline constexpr AUTO_REFREF operator()(V &&v) const - AUTO_REFREF_RETURN(lib::forward(v).head_) - }; - - template - inline static constexpr AUTO_REFREF get_alt(V &&v, in_place_index_t) - AUTO_REFREF_RETURN(get_alt_impl{}(lib::forward(v))) -#endif - }; - - struct base { - template - inline static constexpr AUTO_REFREF get_alt(V &&v) -#ifdef _MSC_VER - AUTO_REFREF_RETURN(recursive_union::get_alt( - lib::forward(v).data_, in_place_index_t{})) -#else - AUTO_REFREF_RETURN(recursive_union::get_alt( - data(lib::forward(v)), in_place_index_t{})) -#endif - }; - - struct variant { - template - inline static constexpr AUTO_REFREF get_alt(V &&v) - AUTO_REFREF_RETURN(base::get_alt(lib::forward(v).impl_)) - }; - - } // namespace access - - namespace visitation { - -#if defined(MPARK_CPP14_CONSTEXPR) && !defined(_MSC_VER) -#define MPARK_VARIANT_SWITCH_VISIT -#endif - - struct base { - template - using dispatch_result_t = decltype( - lib::invoke(std::declval(), - access::base::get_alt<0>(std::declval())...)); - - template - struct expected { - template - inline static constexpr bool but_got() { - return std::is_same::value; - } - }; - - template - struct visit_return_type_check { - static_assert( - expected::template but_got(), - "`visit` requires the visitor to have a single return type"); - - template - inline static constexpr DECLTYPE_AUTO invoke(Visitor &&visitor, - Alts &&... alts) - DECLTYPE_AUTO_RETURN(lib::invoke(lib::forward(visitor), - lib::forward(alts)...)) - }; - -#ifdef MPARK_VARIANT_SWITCH_VISIT - template - struct dispatcher; - - template - struct dispatcher { - template - MPARK_ALWAYS_INLINE static constexpr R dispatch( - F &&, typename ITs::type &&..., Vs &&...) 
{ - MPARK_BUILTIN_UNREACHABLE; - } - - template - MPARK_ALWAYS_INLINE static constexpr R dispatch_case(F &&, Vs &&...) { - MPARK_BUILTIN_UNREACHABLE; - } - - template - MPARK_ALWAYS_INLINE static constexpr R dispatch_at(std::size_t, - F &&, - Vs &&...) { - MPARK_BUILTIN_UNREACHABLE; - } - }; - - template - struct dispatcher { - template - MPARK_ALWAYS_INLINE static constexpr R dispatch( - F &&f, typename ITs::type &&... visited_vs) { - using Expected = R; - using Actual = decltype(lib::invoke( - lib::forward(f), - access::base::get_alt( - lib::forward(visited_vs))...)); - return visit_return_type_check::invoke( - lib::forward(f), - access::base::get_alt( - lib::forward(visited_vs))...); - } - - template - MPARK_ALWAYS_INLINE static constexpr R dispatch( - F &&f, typename ITs::type &&... visited_vs, V &&v, Vs &&... vs) { -#define MPARK_DISPATCH(I) \ - dispatcher<(I < lib::decay_t::size()), \ - R, \ - ITs..., \ - lib::indexed_type>:: \ - template dispatch<0>(lib::forward(f), \ - lib::forward(visited_vs)..., \ - lib::forward(v), \ - lib::forward(vs)...) - -#define MPARK_DEFAULT(I) \ - dispatcher<(I < lib::decay_t::size()), R, ITs...>::template dispatch( \ - lib::forward(f), \ - lib::forward(visited_vs)..., \ - lib::forward(v), \ - lib::forward(vs)...) - - switch (v.index()) { - case B + 0: return MPARK_DISPATCH(B + 0); - case B + 1: return MPARK_DISPATCH(B + 1); - case B + 2: return MPARK_DISPATCH(B + 2); - case B + 3: return MPARK_DISPATCH(B + 3); - case B + 4: return MPARK_DISPATCH(B + 4); - case B + 5: return MPARK_DISPATCH(B + 5); - case B + 6: return MPARK_DISPATCH(B + 6); - case B + 7: return MPARK_DISPATCH(B + 7); - case B + 8: return MPARK_DISPATCH(B + 8); - case B + 9: return MPARK_DISPATCH(B + 9); - case B + 10: return MPARK_DISPATCH(B + 10); - case B + 11: return MPARK_DISPATCH(B + 11); - case B + 12: return MPARK_DISPATCH(B + 12); - case B + 13: return MPARK_DISPATCH(B + 13); - case B + 14: return MPARK_DISPATCH(B + 14); - case B + 15: return MPARK_DISPATCH(B + 15); - case B + 16: return MPARK_DISPATCH(B + 16); - case B + 17: return MPARK_DISPATCH(B + 17); - case B + 18: return MPARK_DISPATCH(B + 18); - case B + 19: return MPARK_DISPATCH(B + 19); - case B + 20: return MPARK_DISPATCH(B + 20); - case B + 21: return MPARK_DISPATCH(B + 21); - case B + 22: return MPARK_DISPATCH(B + 22); - case B + 23: return MPARK_DISPATCH(B + 23); - case B + 24: return MPARK_DISPATCH(B + 24); - case B + 25: return MPARK_DISPATCH(B + 25); - case B + 26: return MPARK_DISPATCH(B + 26); - case B + 27: return MPARK_DISPATCH(B + 27); - case B + 28: return MPARK_DISPATCH(B + 28); - case B + 29: return MPARK_DISPATCH(B + 29); - case B + 30: return MPARK_DISPATCH(B + 30); - case B + 31: return MPARK_DISPATCH(B + 31); - default: return MPARK_DEFAULT(B + 32); - } - -#undef MPARK_DEFAULT -#undef MPARK_DISPATCH - } - - template - MPARK_ALWAYS_INLINE static constexpr R dispatch_case(F &&f, - Vs &&... vs) { - using Expected = R; - using Actual = decltype( - lib::invoke(lib::forward(f), - access::base::get_alt(lib::forward(vs))...)); - return visit_return_type_check::invoke( - lib::forward(f), - access::base::get_alt(lib::forward(vs))...); - } - - template - MPARK_ALWAYS_INLINE static constexpr R dispatch_at(std::size_t index, - F &&f, - V &&v, - Vs &&... 
vs) { - static_assert(lib::all<(lib::decay_t::size() == - lib::decay_t::size())...>::value, - "all of the variants must be the same size."); -#define MPARK_DISPATCH_AT(I) \ - dispatcher<(I < lib::decay_t::size()), R>::template dispatch_case( \ - lib::forward(f), lib::forward(v), lib::forward(vs)...) - -#define MPARK_DEFAULT(I) \ - dispatcher<(I < lib::decay_t::size()), R>::template dispatch_at( \ - index, lib::forward(f), lib::forward(v), lib::forward(vs)...) - - switch (index) { - case B + 0: return MPARK_DISPATCH_AT(B + 0); - case B + 1: return MPARK_DISPATCH_AT(B + 1); - case B + 2: return MPARK_DISPATCH_AT(B + 2); - case B + 3: return MPARK_DISPATCH_AT(B + 3); - case B + 4: return MPARK_DISPATCH_AT(B + 4); - case B + 5: return MPARK_DISPATCH_AT(B + 5); - case B + 6: return MPARK_DISPATCH_AT(B + 6); - case B + 7: return MPARK_DISPATCH_AT(B + 7); - case B + 8: return MPARK_DISPATCH_AT(B + 8); - case B + 9: return MPARK_DISPATCH_AT(B + 9); - case B + 10: return MPARK_DISPATCH_AT(B + 10); - case B + 11: return MPARK_DISPATCH_AT(B + 11); - case B + 12: return MPARK_DISPATCH_AT(B + 12); - case B + 13: return MPARK_DISPATCH_AT(B + 13); - case B + 14: return MPARK_DISPATCH_AT(B + 14); - case B + 15: return MPARK_DISPATCH_AT(B + 15); - case B + 16: return MPARK_DISPATCH_AT(B + 16); - case B + 17: return MPARK_DISPATCH_AT(B + 17); - case B + 18: return MPARK_DISPATCH_AT(B + 18); - case B + 19: return MPARK_DISPATCH_AT(B + 19); - case B + 20: return MPARK_DISPATCH_AT(B + 20); - case B + 21: return MPARK_DISPATCH_AT(B + 21); - case B + 22: return MPARK_DISPATCH_AT(B + 22); - case B + 23: return MPARK_DISPATCH_AT(B + 23); - case B + 24: return MPARK_DISPATCH_AT(B + 24); - case B + 25: return MPARK_DISPATCH_AT(B + 25); - case B + 26: return MPARK_DISPATCH_AT(B + 26); - case B + 27: return MPARK_DISPATCH_AT(B + 27); - case B + 28: return MPARK_DISPATCH_AT(B + 28); - case B + 29: return MPARK_DISPATCH_AT(B + 29); - case B + 30: return MPARK_DISPATCH_AT(B + 30); - case B + 31: return MPARK_DISPATCH_AT(B + 31); - default: return MPARK_DEFAULT(B + 32); - } - -#undef MPARK_DEFAULT -#undef MPARK_DISPATCH_AT - } - }; -#else - template - inline static constexpr const T &at(const T &elem) noexcept { - return elem; - } - - template - inline static constexpr const lib::remove_all_extents_t &at( - const lib::array &elems, std::size_t i, Is... is) noexcept { - return at(elems[i], is...); - } - - template - inline static constexpr lib::array, sizeof...(Fs) + 1> - make_farray(F &&f, Fs &&... fs) { - return {{lib::forward(f), lib::forward(fs)...}}; - } - - template - struct make_fmatrix_impl { - - template - inline static constexpr dispatch_result_t dispatch( - F &&f, Vs &&... vs) { - using Expected = dispatch_result_t; - using Actual = decltype(lib::invoke( - lib::forward(f), - access::base::get_alt(lib::forward(vs))...)); - return visit_return_type_check::invoke( - lib::forward(f), - access::base::get_alt(lib::forward(vs))...); - } - -#ifdef MPARK_RETURN_TYPE_DEDUCTION - template - inline static constexpr auto impl(lib::index_sequence) { - return &dispatch; - } - - template - inline static constexpr auto impl(Is, - lib::index_sequence, - Ls... 
ls) { - return make_farray(impl(lib::push_back_t{}, ls...)...); - } -#else - template - struct impl; - - template - struct impl> { - inline constexpr AUTO operator()() const - AUTO_RETURN(&dispatch) - }; - - template - struct impl, Ls...> { - inline constexpr AUTO operator()() const - AUTO_RETURN( - make_farray(impl, Ls...>{}()...)) - }; -#endif - }; - -#ifdef MPARK_RETURN_TYPE_DEDUCTION - template - inline static constexpr auto make_fmatrix() { - return make_fmatrix_impl::impl( - lib::index_sequence<>{}, - lib::make_index_sequence::size()>{}...); - } -#else - template - inline static constexpr AUTO make_fmatrix() - AUTO_RETURN( - typename make_fmatrix_impl::template impl< - lib::index_sequence<>, - lib::make_index_sequence::size()>...>{}()) -#endif - - template - struct make_fdiagonal_impl { - template - inline static constexpr dispatch_result_t dispatch( - F &&f, Vs &&... vs) { - using Expected = dispatch_result_t; - using Actual = decltype( - lib::invoke(lib::forward(f), - access::base::get_alt(lib::forward(vs))...)); - return visit_return_type_check::invoke( - lib::forward(f), - access::base::get_alt(lib::forward(vs))...); - } - - template - inline static constexpr AUTO impl(lib::index_sequence) - AUTO_RETURN(make_farray(&dispatch...)) - }; - - template - inline static constexpr auto make_fdiagonal() - -> decltype(make_fdiagonal_impl::impl( - lib::make_index_sequence::size()>{})) { - static_assert(lib::all<(lib::decay_t::size() == - lib::decay_t::size())...>::value, - "all of the variants must be the same size."); - return make_fdiagonal_impl::impl( - lib::make_index_sequence::size()>{}); - } -#endif - }; - -#if !defined(MPARK_VARIANT_SWITCH_VISIT) && \ - (!defined(_MSC_VER) || _MSC_VER >= 1910) - template - using fmatrix_t = decltype(base::make_fmatrix()); - - template - struct fmatrix { - static constexpr fmatrix_t value = - base::make_fmatrix(); - }; - - template - constexpr fmatrix_t fmatrix::value; - - template - using fdiagonal_t = decltype(base::make_fdiagonal()); - - template - struct fdiagonal { - static constexpr fdiagonal_t value = - base::make_fdiagonal(); - }; - - template - constexpr fdiagonal_t fdiagonal::value; -#endif - - struct alt { - template - inline static constexpr DECLTYPE_AUTO visit_alt(Visitor &&visitor, - Vs &&... vs) -#ifdef MPARK_VARIANT_SWITCH_VISIT - DECLTYPE_AUTO_RETURN( - base::dispatcher< - true, - base::dispatch_result_t(vs)))...>>:: - template dispatch<0>(lib::forward(visitor), - as_base(lib::forward(vs))...)) -#elif !defined(_MSC_VER) || _MSC_VER >= 1910 - DECLTYPE_AUTO_RETURN(base::at( - fmatrix(vs)))...>::value, - vs.index()...)(lib::forward(visitor), - as_base(lib::forward(vs))...)) -#else - DECLTYPE_AUTO_RETURN(base::at( - base::make_fmatrix(vs)))...>(), - vs.index()...)(lib::forward(visitor), - as_base(lib::forward(vs))...)) -#endif - - template - inline static constexpr DECLTYPE_AUTO visit_alt_at(std::size_t index, - Visitor &&visitor, - Vs &&... 
vs) -#ifdef MPARK_VARIANT_SWITCH_VISIT - DECLTYPE_AUTO_RETURN( - base::dispatcher< - true, - base::dispatch_result_t(vs)))...>>:: - template dispatch_at<0>(index, - lib::forward(visitor), - as_base(lib::forward(vs))...)) -#elif !defined(_MSC_VER) || _MSC_VER >= 1910 - DECLTYPE_AUTO_RETURN(base::at( - fdiagonal(vs)))...>::value, - index)(lib::forward(visitor), - as_base(lib::forward(vs))...)) -#else - DECLTYPE_AUTO_RETURN(base::at( - base::make_fdiagonal(vs)))...>(), - index)(lib::forward(visitor), - as_base(lib::forward(vs))...)) -#endif - }; - - struct variant { - private: - template - struct visitor { - template - inline static constexpr bool does_not_handle() { - return lib::is_invocable::value; - } - }; - - template - struct visit_exhaustiveness_check { - static_assert(visitor::template does_not_handle(), - "`visit` requires the visitor to be exhaustive."); - - inline static constexpr DECLTYPE_AUTO invoke(Visitor &&visitor, - Values &&... values) - DECLTYPE_AUTO_RETURN(lib::invoke(lib::forward(visitor), - lib::forward(values)...)) - }; - - template - struct value_visitor { - Visitor &&visitor_; - - template - inline constexpr DECLTYPE_AUTO operator()(Alts &&... alts) const - DECLTYPE_AUTO_RETURN( - visit_exhaustiveness_check< - Visitor, - decltype((lib::forward(alts).value))...>:: - invoke(lib::forward(visitor_), - lib::forward(alts).value...)) - }; - - template - inline static constexpr AUTO make_value_visitor(Visitor &&visitor) - AUTO_RETURN(value_visitor{lib::forward(visitor)}) - - public: - template - inline static constexpr DECLTYPE_AUTO visit_alt(Visitor &&visitor, - Vs &&... vs) - DECLTYPE_AUTO_RETURN(alt::visit_alt(lib::forward(visitor), - lib::forward(vs).impl_...)) - - template - inline static constexpr DECLTYPE_AUTO visit_alt_at(std::size_t index, - Visitor &&visitor, - Vs &&... vs) - DECLTYPE_AUTO_RETURN( - alt::visit_alt_at(index, - lib::forward(visitor), - lib::forward(vs).impl_...)) - - template - inline static constexpr DECLTYPE_AUTO visit_value(Visitor &&visitor, - Vs &&... vs) - DECLTYPE_AUTO_RETURN( - visit_alt(make_value_visitor(lib::forward(visitor)), - lib::forward(vs)...)) - - template - inline static constexpr DECLTYPE_AUTO visit_value_at(std::size_t index, - Visitor &&visitor, - Vs &&... vs) - DECLTYPE_AUTO_RETURN( - visit_alt_at(index, - make_value_visitor(lib::forward(visitor)), - lib::forward(vs)...)) - }; - - } // namespace visitation - - template - struct alt { - using value_type = T; - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4244) -#endif - template - inline explicit constexpr alt(variant_in_place_t, Args &&... args) - : value(lib::forward(args)...) {} -#ifdef _MSC_VER -#pragma warning(pop) -#endif - - T value; - }; - - template - union recursive_union; - - template - union recursive_union {}; - -#define MPARK_VARIANT_RECURSIVE_UNION(destructible_trait, destructor) \ - template \ - union recursive_union { \ - public: \ - inline explicit constexpr recursive_union(valueless_t) noexcept \ - : dummy_{} {} \ - \ - template \ - inline explicit constexpr recursive_union(in_place_index_t<0>, \ - Args &&... args) \ - : head_(variant_in_place_t{}, lib::forward(args)...) {} \ - \ - template \ - inline explicit constexpr recursive_union(in_place_index_t, \ - Args &&... args) \ - : tail_(in_place_index_t{}, lib::forward(args)...) 
{} \ - \ - recursive_union(const recursive_union &) = default; \ - recursive_union(recursive_union &&) = default; \ - \ - destructor \ - \ - recursive_union &operator=(const recursive_union &) = default; \ - recursive_union &operator=(recursive_union &&) = default; \ - \ - private: \ - char dummy_; \ - alt head_; \ - recursive_union tail_; \ - \ - friend struct access::recursive_union; \ - } - - MPARK_VARIANT_RECURSIVE_UNION(Trait::TriviallyAvailable, - ~recursive_union() = default;); - MPARK_VARIANT_RECURSIVE_UNION(Trait::Available, - ~recursive_union() {}); - MPARK_VARIANT_RECURSIVE_UNION(Trait::Unavailable, - ~recursive_union() = delete;); - -#undef MPARK_VARIANT_RECURSIVE_UNION - - using index_t = unsigned int; - - template - class base { - public: - inline explicit constexpr base(valueless_t tag) noexcept - : data_(tag), index_(static_cast(-1)) {} - - template - inline explicit constexpr base(in_place_index_t, Args &&... args) - : data_(in_place_index_t{}, lib::forward(args)...), - index_(I) {} - - inline constexpr bool valueless_by_exception() const noexcept { - return index_ == static_cast(-1); - } - - inline constexpr std::size_t index() const noexcept { - return valueless_by_exception() ? variant_npos : index_; - } - - protected: - using data_t = recursive_union; - - friend inline constexpr base &as_base(base &b) { return b; } - friend inline constexpr const base &as_base(const base &b) { return b; } - friend inline constexpr base &&as_base(base &&b) { return lib::move(b); } - friend inline constexpr const base &&as_base(const base &&b) { return lib::move(b); } - - friend inline constexpr data_t &data(base &b) { return b.data_; } - friend inline constexpr const data_t &data(const base &b) { return b.data_; } - friend inline constexpr data_t &&data(base &&b) { return lib::move(b).data_; } - friend inline constexpr const data_t &&data(const base &&b) { return lib::move(b).data_; } - - inline static constexpr std::size_t size() { return sizeof...(Ts); } - - data_t data_; - index_t index_; - - friend struct access::base; - friend struct visitation::base; - }; - - struct dtor { -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4100) -#endif - template - inline void operator()(Alt &alt) const noexcept { alt.~Alt(); } -#ifdef _MSC_VER -#pragma warning(pop) -#endif - }; - -#if !defined(_MSC_VER) || _MSC_VER >= 1910 -#define MPARK_INHERITING_CTOR(type, base) using base::base; -#else -#define MPARK_INHERITING_CTOR(type, base) \ - template \ - inline explicit constexpr type(Args &&... args) \ - : base(lib::forward(args)...) 
{} -#endif - - template - class destructor; - -#define MPARK_VARIANT_DESTRUCTOR(destructible_trait, definition, destroy) \ - template \ - class destructor, destructible_trait> \ - : public base { \ - using super = base; \ - \ - public: \ - MPARK_INHERITING_CTOR(destructor, super) \ - using super::operator=; \ - \ - destructor(const destructor &) = default; \ - destructor(destructor &&) = default; \ - definition \ - destructor &operator=(const destructor &) = default; \ - destructor &operator=(destructor &&) = default; \ - \ - protected: \ - destroy \ - } - - MPARK_VARIANT_DESTRUCTOR( - Trait::TriviallyAvailable, - ~destructor() = default;, - inline void destroy() noexcept { - this->index_ = static_cast(-1); - }); - - MPARK_VARIANT_DESTRUCTOR( - Trait::Available, - ~destructor() { destroy(); }, - inline void destroy() noexcept { - if (!this->valueless_by_exception()) { - visitation::alt::visit_alt(dtor{}, *this); - } - this->index_ = static_cast(-1); - }); - - MPARK_VARIANT_DESTRUCTOR( - Trait::Unavailable, - ~destructor() = delete;, - inline void destroy() noexcept = delete;); - -#undef MPARK_VARIANT_DESTRUCTOR - - template - class constructor : public destructor { - using super = destructor; - - public: - MPARK_INHERITING_CTOR(constructor, super) - using super::operator=; - - protected: -#ifndef MPARK_GENERIC_LAMBDAS - struct ctor { - template - inline void operator()(LhsAlt &lhs_alt, RhsAlt &&rhs_alt) const { - constructor::construct_alt(lhs_alt, - lib::forward(rhs_alt).value); - } - }; -#endif - - template - inline static T &construct_alt(alt &a, Args &&... args) { - auto *result = ::new (static_cast(lib::addressof(a))) - alt(variant_in_place_t{}, lib::forward(args)...); - return result->value; - } - - template - inline static void generic_construct(constructor &lhs, Rhs &&rhs) { - lhs.destroy(); - if (!rhs.valueless_by_exception()) { - visitation::alt::visit_alt_at( - rhs.index(), -#ifdef MPARK_GENERIC_LAMBDAS - [](auto &lhs_alt, auto &&rhs_alt) { - constructor::construct_alt( - lhs_alt, lib::forward(rhs_alt).value); - } -#else - ctor{} -#endif - , - lhs, - lib::forward(rhs)); - lhs.index_ = rhs.index_; - } - } - }; - - template - class move_constructor; - -#define MPARK_VARIANT_MOVE_CONSTRUCTOR(move_constructible_trait, definition) \ - template \ - class move_constructor, move_constructible_trait> \ - : public constructor> { \ - using super = constructor>; \ - \ - public: \ - MPARK_INHERITING_CTOR(move_constructor, super) \ - using super::operator=; \ - \ - move_constructor(const move_constructor &) = default; \ - definition \ - ~move_constructor() = default; \ - move_constructor &operator=(const move_constructor &) = default; \ - move_constructor &operator=(move_constructor &&) = default; \ - } - - MPARK_VARIANT_MOVE_CONSTRUCTOR( - Trait::TriviallyAvailable, - move_constructor(move_constructor &&that) = default;); - - MPARK_VARIANT_MOVE_CONSTRUCTOR( - Trait::Available, - move_constructor(move_constructor &&that) noexcept( - lib::all::value...>::value) - : move_constructor(valueless_t{}) { - this->generic_construct(*this, lib::move(that)); - }); - - MPARK_VARIANT_MOVE_CONSTRUCTOR( - Trait::Unavailable, - move_constructor(move_constructor &&) = delete;); - -#undef MPARK_VARIANT_MOVE_CONSTRUCTOR - - template - class copy_constructor; - -#define MPARK_VARIANT_COPY_CONSTRUCTOR(copy_constructible_trait, definition) \ - template \ - class copy_constructor, copy_constructible_trait> \ - : public move_constructor> { \ - using super = move_constructor>; \ - \ - public: \ - 
MPARK_INHERITING_CTOR(copy_constructor, super) \ - using super::operator=; \ - \ - definition \ - copy_constructor(copy_constructor &&) = default; \ - ~copy_constructor() = default; \ - copy_constructor &operator=(const copy_constructor &) = default; \ - copy_constructor &operator=(copy_constructor &&) = default; \ - } - - MPARK_VARIANT_COPY_CONSTRUCTOR( - Trait::TriviallyAvailable, - copy_constructor(const copy_constructor &that) = default;); - - MPARK_VARIANT_COPY_CONSTRUCTOR( - Trait::Available, - copy_constructor(const copy_constructor &that) - : copy_constructor(valueless_t{}) { - this->generic_construct(*this, that); - }); - - MPARK_VARIANT_COPY_CONSTRUCTOR( - Trait::Unavailable, - copy_constructor(const copy_constructor &) = delete;); - -#undef MPARK_VARIANT_COPY_CONSTRUCTOR - - template - class assignment : public copy_constructor { - using super = copy_constructor; - - public: - MPARK_INHERITING_CTOR(assignment, super) - using super::operator=; - - template - inline /* auto & */ auto emplace(Args &&... args) - -> decltype(this->construct_alt(access::base::get_alt(*this), - lib::forward(args)...)) { - this->destroy(); - auto &result = this->construct_alt(access::base::get_alt(*this), - lib::forward(args)...); - this->index_ = I; - return result; - } - - protected: -#ifndef MPARK_GENERIC_LAMBDAS - template - struct assigner { - template - inline void operator()(ThisAlt &this_alt, ThatAlt &&that_alt) const { - self->assign_alt(this_alt, lib::forward(that_alt).value); - } - assignment *self; - }; -#endif - - template - inline void assign_alt(alt &a, Arg &&arg) { - if (this->index() == I) { -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4244) -#endif - a.value = lib::forward(arg); -#ifdef _MSC_VER -#pragma warning(pop) -#endif - } else { - struct { - void operator()(std::true_type) const { - this_->emplace(lib::forward(arg_)); - } - void operator()(std::false_type) const { - this_->emplace(T(lib::forward(arg_))); - } - assignment *this_; - Arg &&arg_; - } impl{this, lib::forward(arg)}; - impl(lib::bool_constant< - std::is_nothrow_constructible::value || - !std::is_nothrow_move_constructible::value>{}); - } - } - - template - inline void generic_assign(That &&that) { - if (this->valueless_by_exception() && that.valueless_by_exception()) { - // do nothing. 
- } else if (that.valueless_by_exception()) { - this->destroy(); - } else { - visitation::alt::visit_alt_at( - that.index(), -#ifdef MPARK_GENERIC_LAMBDAS - [this](auto &this_alt, auto &&that_alt) { - this->assign_alt( - this_alt, lib::forward(that_alt).value); - } -#else - assigner{this} -#endif - , - *this, - lib::forward(that)); - } - } - }; - - template - class move_assignment; - -#define MPARK_VARIANT_MOVE_ASSIGNMENT(move_assignable_trait, definition) \ - template \ - class move_assignment, move_assignable_trait> \ - : public assignment> { \ - using super = assignment>; \ - \ - public: \ - MPARK_INHERITING_CTOR(move_assignment, super) \ - using super::operator=; \ - \ - move_assignment(const move_assignment &) = default; \ - move_assignment(move_assignment &&) = default; \ - ~move_assignment() = default; \ - move_assignment &operator=(const move_assignment &) = default; \ - definition \ - } - - MPARK_VARIANT_MOVE_ASSIGNMENT( - Trait::TriviallyAvailable, - move_assignment &operator=(move_assignment &&that) = default;); - - MPARK_VARIANT_MOVE_ASSIGNMENT( - Trait::Available, - move_assignment & - operator=(move_assignment &&that) noexcept( - lib::all<(std::is_nothrow_move_constructible::value && - std::is_nothrow_move_assignable::value)...>::value) { - this->generic_assign(lib::move(that)); - return *this; - }); - - MPARK_VARIANT_MOVE_ASSIGNMENT( - Trait::Unavailable, - move_assignment &operator=(move_assignment &&) = delete;); - -#undef MPARK_VARIANT_MOVE_ASSIGNMENT - - template - class copy_assignment; - -#define MPARK_VARIANT_COPY_ASSIGNMENT(copy_assignable_trait, definition) \ - template \ - class copy_assignment, copy_assignable_trait> \ - : public move_assignment> { \ - using super = move_assignment>; \ - \ - public: \ - MPARK_INHERITING_CTOR(copy_assignment, super) \ - using super::operator=; \ - \ - copy_assignment(const copy_assignment &) = default; \ - copy_assignment(copy_assignment &&) = default; \ - ~copy_assignment() = default; \ - definition \ - copy_assignment &operator=(copy_assignment &&) = default; \ - } - - MPARK_VARIANT_COPY_ASSIGNMENT( - Trait::TriviallyAvailable, - copy_assignment &operator=(const copy_assignment &that) = default;); - - MPARK_VARIANT_COPY_ASSIGNMENT( - Trait::Available, - copy_assignment &operator=(const copy_assignment &that) { - this->generic_assign(that); - return *this; - }); - - MPARK_VARIANT_COPY_ASSIGNMENT( - Trait::Unavailable, - copy_assignment &operator=(const copy_assignment &) = delete;); - -#undef MPARK_VARIANT_COPY_ASSIGNMENT - - template - class impl : public copy_assignment> { - using super = copy_assignment>; - - public: - MPARK_INHERITING_CTOR(impl, super) - using super::operator=; - - template - inline void assign(Arg &&arg) { - this->assign_alt(access::base::get_alt(*this), - lib::forward(arg)); - } - - inline void swap(impl &that) { - if (this->valueless_by_exception() && that.valueless_by_exception()) { - // do nothing. 
- } else if (this->index() == that.index()) { - visitation::alt::visit_alt_at(this->index(), -#ifdef MPARK_GENERIC_LAMBDAS - [](auto &this_alt, auto &that_alt) { - using std::swap; - swap(this_alt.value, - that_alt.value); - } -#else - swapper{} -#endif - , - *this, - that); - } else { - impl *lhs = this; - impl *rhs = lib::addressof(that); - if (lhs->move_nothrow() && !rhs->move_nothrow()) { - std::swap(lhs, rhs); - } - impl tmp(lib::move(*rhs)); -#ifdef MPARK_EXCEPTIONS - // EXTENSION: When the move construction of `lhs` into `rhs` throws - // and `tmp` is nothrow move constructible then we move `tmp` back - // into `rhs` and provide the strong exception safety guarantee. - try { - this->generic_construct(*rhs, lib::move(*lhs)); - } catch (...) { - if (tmp.move_nothrow()) { - this->generic_construct(*rhs, lib::move(tmp)); - } - throw; - } -#else - this->generic_construct(*rhs, lib::move(*lhs)); -#endif - this->generic_construct(*lhs, lib::move(tmp)); - } - } - - private: -#ifndef MPARK_GENERIC_LAMBDAS - struct swapper { - template - inline void operator()(ThisAlt &this_alt, ThatAlt &that_alt) const { - using std::swap; - swap(this_alt.value, that_alt.value); - } - }; -#endif - - inline constexpr bool move_nothrow() const { - return this->valueless_by_exception() || - lib::array{ - {std::is_nothrow_move_constructible::value...} - }[this->index()]; - } - }; - -#undef MPARK_INHERITING_CTOR - - template - struct overload_leaf { - using F = lib::size_constant (*)(T); - operator F() const { return nullptr; } - }; - - template - struct overload_impl { - private: - template - struct impl; - - template - struct impl> : overload_leaf... {}; - - public: - using type = impl>; - }; - - template - using overload = typename overload_impl::type; - - template - using best_match = lib::invoke_result_t, T &&>; - - template - struct is_in_place_index : std::false_type {}; - - template - struct is_in_place_index> : std::true_type {}; - - template - struct is_in_place_type : std::false_type {}; - - template - struct is_in_place_type> : std::true_type {}; - - } // detail_ - - template - class variant { - static_assert(0 < sizeof...(Ts), - "variant must consist of at least one alternative."); - - static_assert(lib::all::value...>::value, - "variant can not have an array type as an alternative."); - - static_assert(lib::all::value...>::value, - "variant can not have a reference type as an alternative."); - - static_assert(lib::all::value...>::value, - "variant can not have a void type as an alternative."); - - public: - template < - typename Front = lib::type_pack_element_t<0, Ts...>, - lib::enable_if_t::value, int> = 0> - inline constexpr variant() noexcept( - std::is_nothrow_default_constructible::value) - : impl_(in_place_index_t<0>{}) {} - - variant(const variant &) = default; - variant(variant &&) = default; - - template < - typename Arg, - typename Decayed = lib::decay_t, - lib::enable_if_t::value, int> = 0, - lib::enable_if_t::value, int> = 0, - lib::enable_if_t::value, int> = 0, - std::size_t I = detail_::best_match::value, - typename T = lib::type_pack_element_t, - lib::enable_if_t::value, int> = 0> - inline constexpr variant(Arg &&arg) noexcept( - std::is_nothrow_constructible::value) - : impl_(in_place_index_t{}, lib::forward(arg)) {} - - template < - std::size_t I, - typename... Args, - typename T = lib::type_pack_element_t, - lib::enable_if_t::value, int> = 0> - inline explicit constexpr variant( - in_place_index_t, - Args &&... 
args) noexcept(std::is_nothrow_constructible::value) - : impl_(in_place_index_t{}, lib::forward(args)...) {} - - template < - std::size_t I, - typename Up, - typename... Args, - typename T = lib::type_pack_element_t, - lib::enable_if_t &, - Args...>::value, - int> = 0> - inline explicit constexpr variant( - in_place_index_t, - std::initializer_list il, - Args &&... args) noexcept(std:: - is_nothrow_constructible< - T, - std::initializer_list &, - Args...>::value) - : impl_(in_place_index_t{}, il, lib::forward(args)...) {} - - template < - typename T, - typename... Args, - std::size_t I = detail_::find_index_sfinae::value, - lib::enable_if_t::value, int> = 0> - inline explicit constexpr variant( - in_place_type_t, - Args &&... args) noexcept(std::is_nothrow_constructible::value) - : impl_(in_place_index_t{}, lib::forward(args)...) {} - - template < - typename T, - typename Up, - typename... Args, - std::size_t I = detail_::find_index_sfinae::value, - lib::enable_if_t &, - Args...>::value, - int> = 0> - inline explicit constexpr variant( - in_place_type_t, - std::initializer_list il, - Args &&... args) noexcept(std:: - is_nothrow_constructible< - T, - std::initializer_list &, - Args...>::value) - : impl_(in_place_index_t{}, il, lib::forward(args)...) {} - - ~variant() = default; - - variant &operator=(const variant &) = default; - variant &operator=(variant &&) = default; - - template , variant>::value, - int> = 0, - std::size_t I = detail_::best_match::value, - typename T = lib::type_pack_element_t, - lib::enable_if_t<(std::is_assignable::value && - std::is_constructible::value), - int> = 0> - inline variant &operator=(Arg &&arg) noexcept( - (std::is_nothrow_assignable::value && - std::is_nothrow_constructible::value)) { - impl_.template assign(lib::forward(arg)); - return *this; - } - - template < - std::size_t I, - typename... Args, - typename T = lib::type_pack_element_t, - lib::enable_if_t::value, int> = 0> - inline T &emplace(Args &&... args) { - return impl_.template emplace(lib::forward(args)...); - } - - template < - std::size_t I, - typename Up, - typename... Args, - typename T = lib::type_pack_element_t, - lib::enable_if_t &, - Args...>::value, - int> = 0> - inline T &emplace(std::initializer_list il, Args &&... args) { - return impl_.template emplace(il, lib::forward(args)...); - } - - template < - typename T, - typename... Args, - std::size_t I = detail_::find_index_sfinae::value, - lib::enable_if_t::value, int> = 0> - inline T &emplace(Args &&... args) { - return impl_.template emplace(lib::forward(args)...); - } - - template < - typename T, - typename Up, - typename... Args, - std::size_t I = detail_::find_index_sfinae::value, - lib::enable_if_t &, - Args...>::value, - int> = 0> - inline T &emplace(std::initializer_list il, Args &&... 
args) { - return impl_.template emplace(il, lib::forward(args)...); - } - - inline constexpr bool valueless_by_exception() const noexcept { - return impl_.valueless_by_exception(); - } - - inline constexpr std::size_t index() const noexcept { - return impl_.index(); - } - - template , - Dummy>::value && - lib::dependent_type, - Dummy>::value)...>::value, - int> = 0> - inline void swap(variant &that) noexcept( - lib::all<(std::is_nothrow_move_constructible::value && - lib::is_nothrow_swappable::value)...>::value) { - impl_.swap(that.impl_); - } - - private: - detail_::impl impl_; - - friend struct detail_::access::variant; - friend struct detail_::visitation::variant; - }; - - template - inline constexpr bool holds_alternative(const variant &v) noexcept { - return v.index() == I; - } - - template - inline constexpr bool holds_alternative(const variant &v) noexcept { - return holds_alternative::value>(v); - } - - namespace detail_ { - template - struct generic_get_impl { - constexpr generic_get_impl(int) noexcept {} - - constexpr AUTO_REFREF operator()(V &&v) const - AUTO_REFREF_RETURN( - access::variant::get_alt(lib::forward(v)).value) - }; - - template - inline constexpr AUTO_REFREF generic_get(V &&v) - AUTO_REFREF_RETURN(generic_get_impl( - holds_alternative(v) ? 0 : (throw_bad_variant_access(), 0))( - lib::forward(v))) - } // namespace detail_ - - template - inline constexpr variant_alternative_t> &get( - variant &v) { - return detail_::generic_get(v); - } - - template - inline constexpr variant_alternative_t> &&get( - variant &&v) { - return detail_::generic_get(lib::move(v)); - } - - template - inline constexpr const variant_alternative_t> &get( - const variant &v) { - return detail_::generic_get(v); - } - - template - inline constexpr const variant_alternative_t> &&get( - const variant &&v) { - return detail_::generic_get(lib::move(v)); - } - - template - inline constexpr T &get(variant &v) { - return get::value>(v); - } - - template - inline constexpr T &&get(variant &&v) { - return get::value>(lib::move(v)); - } - - template - inline constexpr const T &get(const variant &v) { - return get::value>(v); - } - - template - inline constexpr const T &&get(const variant &&v) { - return get::value>(lib::move(v)); - } - - namespace detail_ { - - template - inline constexpr /* auto * */ AUTO generic_get_if(V *v) noexcept - AUTO_RETURN(v && holds_alternative(*v) - ? 
lib::addressof(access::variant::get_alt(*v).value) - : nullptr) - - } // namespace detail_ - - template - inline constexpr lib::add_pointer_t>> - get_if(variant *v) noexcept { - return detail_::generic_get_if(v); - } - - template - inline constexpr lib::add_pointer_t< - const variant_alternative_t>> - get_if(const variant *v) noexcept { - return detail_::generic_get_if(v); - } - - template - inline constexpr lib::add_pointer_t - get_if(variant *v) noexcept { - return get_if::value>(v); - } - - template - inline constexpr lib::add_pointer_t - get_if(const variant *v) noexcept { - return get_if::value>(v); - } - - namespace detail_ { - template - struct convert_to_bool { - template - inline constexpr bool operator()(Lhs &&lhs, Rhs &&rhs) const { - static_assert(std::is_convertible, - bool>::value, - "relational operators must return a type" - " implicitly convertible to bool"); - return lib::invoke( - RelOp{}, lib::forward(lhs), lib::forward(rhs)); - } - }; - } // namespace detail_ - - template - inline constexpr bool operator==(const variant &lhs, - const variant &rhs) { - using detail_::visitation::variant; - using equal_to = detail_::convert_to_bool; -#ifdef MPARK_CPP14_CONSTEXPR - if (lhs.index() != rhs.index()) return false; - if (lhs.valueless_by_exception()) return true; - return variant::visit_value_at(lhs.index(), equal_to{}, lhs, rhs); -#else - return lhs.index() == rhs.index() && - (lhs.valueless_by_exception() || - variant::visit_value_at(lhs.index(), equal_to{}, lhs, rhs)); -#endif - } - - template - inline constexpr bool operator!=(const variant &lhs, - const variant &rhs) { - using detail_::visitation::variant; - using not_equal_to = detail_::convert_to_bool; -#ifdef MPARK_CPP14_CONSTEXPR - if (lhs.index() != rhs.index()) return true; - if (lhs.valueless_by_exception()) return false; - return variant::visit_value_at(lhs.index(), not_equal_to{}, lhs, rhs); -#else - return lhs.index() != rhs.index() || - (!lhs.valueless_by_exception() && - variant::visit_value_at(lhs.index(), not_equal_to{}, lhs, rhs)); -#endif - } - - template - inline constexpr bool operator<(const variant &lhs, - const variant &rhs) { - using detail_::visitation::variant; - using less = detail_::convert_to_bool; -#ifdef MPARK_CPP14_CONSTEXPR - if (rhs.valueless_by_exception()) return false; - if (lhs.valueless_by_exception()) return true; - if (lhs.index() < rhs.index()) return true; - if (lhs.index() > rhs.index()) return false; - return variant::visit_value_at(lhs.index(), less{}, lhs, rhs); -#else - return !rhs.valueless_by_exception() && - (lhs.valueless_by_exception() || lhs.index() < rhs.index() || - (lhs.index() == rhs.index() && - variant::visit_value_at(lhs.index(), less{}, lhs, rhs))); -#endif - } - - template - inline constexpr bool operator>(const variant &lhs, - const variant &rhs) { - using detail_::visitation::variant; - using greater = detail_::convert_to_bool; -#ifdef MPARK_CPP14_CONSTEXPR - if (lhs.valueless_by_exception()) return false; - if (rhs.valueless_by_exception()) return true; - if (lhs.index() > rhs.index()) return true; - if (lhs.index() < rhs.index()) return false; - return variant::visit_value_at(lhs.index(), greater{}, lhs, rhs); -#else - return !lhs.valueless_by_exception() && - (rhs.valueless_by_exception() || lhs.index() > rhs.index() || - (lhs.index() == rhs.index() && - variant::visit_value_at(lhs.index(), greater{}, lhs, rhs))); -#endif - } - - template - inline constexpr bool operator<=(const variant &lhs, - const variant &rhs) { - using detail_::visitation::variant; - 
using less_equal = detail_::convert_to_bool; -#ifdef MPARK_CPP14_CONSTEXPR - if (lhs.valueless_by_exception()) return true; - if (rhs.valueless_by_exception()) return false; - if (lhs.index() < rhs.index()) return true; - if (lhs.index() > rhs.index()) return false; - return variant::visit_value_at(lhs.index(), less_equal{}, lhs, rhs); -#else - return lhs.valueless_by_exception() || - (!rhs.valueless_by_exception() && - (lhs.index() < rhs.index() || - (lhs.index() == rhs.index() && - variant::visit_value_at(lhs.index(), less_equal{}, lhs, rhs)))); -#endif - } - - template - inline constexpr bool operator>=(const variant &lhs, - const variant &rhs) { - using detail_::visitation::variant; - using greater_equal = detail_::convert_to_bool; -#ifdef MPARK_CPP14_CONSTEXPR - if (rhs.valueless_by_exception()) return true; - if (lhs.valueless_by_exception()) return false; - if (lhs.index() > rhs.index()) return true; - if (lhs.index() < rhs.index()) return false; - return variant::visit_value_at(lhs.index(), greater_equal{}, lhs, rhs); -#else - return rhs.valueless_by_exception() || - (!lhs.valueless_by_exception() && - (lhs.index() > rhs.index() || - (lhs.index() == rhs.index() && - variant::visit_value_at( - lhs.index(), greater_equal{}, lhs, rhs)))); -#endif - } - - struct monostate {}; - - inline constexpr bool operator<(monostate, monostate) noexcept { - return false; - } - - inline constexpr bool operator>(monostate, monostate) noexcept { - return false; - } - - inline constexpr bool operator<=(monostate, monostate) noexcept { - return true; - } - - inline constexpr bool operator>=(monostate, monostate) noexcept { - return true; - } - - inline constexpr bool operator==(monostate, monostate) noexcept { - return true; - } - - inline constexpr bool operator!=(monostate, monostate) noexcept { - return false; - } - -#ifdef MPARK_CPP14_CONSTEXPR - namespace detail_ { - - inline constexpr bool all(std::initializer_list bs) { - for (bool b : bs) { - if (!b) { - return false; - } - } - return true; - } - - } // namespace detail_ - - template - inline constexpr decltype(auto) visit(Visitor &&visitor, Vs &&... vs) { - return (detail_::all({!vs.valueless_by_exception()...}) - ? (void)0 - : throw_bad_variant_access()), - detail_::visitation::variant::visit_value( - lib::forward(visitor), lib::forward(vs)...); - } -#else - namespace detail_ { - - template - inline constexpr bool all_impl(const lib::array &bs, - std::size_t idx) { - return idx >= N || (bs[idx] && all_impl(bs, idx + 1)); - } - - template - inline constexpr bool all(const lib::array &bs) { - return all_impl(bs, 0); - } - - } // namespace detail_ - - template - inline constexpr DECLTYPE_AUTO visit(Visitor &&visitor, Vs &&... vs) - DECLTYPE_AUTO_RETURN( - (detail_::all( - lib::array{{!vs.valueless_by_exception()...}}) - ? 
(void)0 - : throw_bad_variant_access()), - detail_::visitation::variant::visit_value(lib::forward(visitor), - lib::forward(vs)...)) -#endif - - template - inline auto swap(variant &lhs, - variant &rhs) noexcept(noexcept(lhs.swap(rhs))) - -> decltype(lhs.swap(rhs)) { - lhs.swap(rhs); - } - - namespace detail_ { - - template - using enabled_type = T; - - namespace hash { - - template - constexpr bool meets_requirements() noexcept { - return std::is_copy_constructible::value && - std::is_move_constructible::value && - lib::is_invocable_r::value; - } - - template - constexpr bool is_enabled() noexcept { - using H = std::hash; - return meets_requirements() && - std::is_default_constructible::value && - std::is_copy_assignable::value && - std::is_move_assignable::value; - } - - } // namespace hash - - } // namespace detail_ - -#undef AUTO -#undef AUTO_RETURN - -#undef AUTO_REFREF -#undef AUTO_REFREF_RETURN - -#undef DECLTYPE_AUTO -#undef DECLTYPE_AUTO_RETURN - -} // namespace c10 - -namespace std { - - template - struct hash, - c10::lib::enable_if_t>()...>::value>>> { - using argument_type = c10::variant; - using result_type = std::size_t; - - inline result_type operator()(const argument_type &v) const { - using c10::detail_::visitation::variant; - std::size_t result = - v.valueless_by_exception() - ? 299792458 // Random value chosen by the universe upon creation - : variant::visit_alt( -#ifdef MPARK_GENERIC_LAMBDAS - [](const auto &alt) { - using alt_type = c10::lib::decay_t; - using value_type = c10::lib::remove_const_t< - typename alt_type::value_type>; - return hash{}(alt.value); - } -#else - hasher{} -#endif - , - v); - return hash_combine(result, hash{}(v.index())); - } - - private: -#ifndef MPARK_GENERIC_LAMBDAS - struct hasher { - template - inline std::size_t operator()(const Alt &alt) const { - using alt_type = c10::lib::decay_t; - using value_type = - c10::lib::remove_const_t; - return hash{}(alt.value); - } - }; -#endif - - static std::size_t hash_combine(std::size_t lhs, std::size_t rhs) { - return lhs ^= rhs + 0x9e3779b9 + (lhs << 6) + (lhs >> 2); - } - }; - - template <> - struct hash { - using argument_type = c10::monostate; - using result_type = std::size_t; - - inline result_type operator()(const argument_type &) const noexcept { - return 66740831; // return a fundamentally attractive random value. - } - }; - -} // namespace std - -#endif // C10_UTIL_VARIANT_H_ From c2223df5789d917513911a475ab5bd682772ca3f Mon Sep 17 00:00:00 2001 From: Kevin Chen Date: Thu, 3 Oct 2019 09:37:42 -0700 Subject: [PATCH 013/113] Implement LpNorm regularizer to be used on the inputs for feature importance (#26376) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26376 * Create the new dense_feature_reg (FCInputLpNorm) for feature importance to be applied to the fully-connected layer for feature-importance. 
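For reference, the quantity this regularizer adds to the loss is

    reg_lambda * pow( sum_i pow( l2norm(theta_i), p ), 1/p )

where theta_i groups the FC weights that read input feature i. A minimal
NumPy sketch of the same math (illustrative only; the function name and the
assumption that axis 0 is the reduced dimension are mine, not part of the
caffe2 net-builder code in this diff):

    import numpy as np

    def fc_input_lp_norm(weight, reg_lambda, p=0.5):
        # weight: 2-D FC weight matrix; summing over axis 0 mirrors
        # ReduceFrontSum, so each column groups the weights feeding one
        # input feature.
        grouped = np.sqrt((weight * weight).sum(axis=0))  # per-feature l2 norm
        return reg_lambda * ((grouped ** p).sum()) ** (1.0 / p)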
Test Plan: * Unit test located in: `caffe2/caffe2/fb/dper/layer_models/tests/split_1/sparse_nn_test.py` Reviewed By: un-disclosed Differential Revision: D17360361 fbshipit-source-id: 1a0e119eeb17199a13dfffe58b3036ea4255e301 --- caffe2/python/regularizer.py | 49 ++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py index 5214a2e016310..563033c509104 100644 --- a/caffe2/python/regularizer.py +++ b/caffe2/python/regularizer.py @@ -42,6 +42,9 @@ def _run_on_loss(self, net, param_init_net, param, grad=None): def _run_after_optimizer(self, net, param_init_net, param, grad): return None + def _feature_grouping(self, param, net): + return None + def _ensure_clipped( self, net, @@ -84,6 +87,52 @@ def _run_on_loss(self, net, param_init_net, param, grad=None): net.Scale([output_blob], [output_blob], scale=self.reg_lambda) return output_blob +class FCInputLpNorm(Regularizer): + def __init__(self, reg_lambda, p_value=0.5): + super(FCInputLpNorm, self).__init__() + assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive" + assert p_value >= 0, "p_value factor should be 0 or positive" + self.p_value = p_value + self.reg_lambda = reg_lambda + + def _feature_grouping(self, param, net): + # Possible alternative grouping method via summing over absolute values + # Compute l2norm over feature weights + # pow( sum_i { pow(theda_i, 2) } , 0.5) + param_mul = net.Mul([param, param], [net.NextScopedBlob("param_mul")]) + param_reduced = net.ReduceFrontSum( + [param_mul], [net.NextScopedBlob("param_reduced")] + ) + grouped_feature_weight_vec = net.Pow( + [param_reduced], + [net.NextScopedBlob("grouped_feature_weight_vec")], + exponent=0.5, + ) + + return grouped_feature_weight_vec + + def _run_on_loss(self, net, param_init_net, param, grad=None): + # TODO: the second dim (num of input nodes) of param is after feature preproc, + # and does not correspond to the original num of dense features. + # In the future, will want to create a util to reduce the input dim of param to + # match the num of dense features. + + output_blob = net.NextScopedBlob(param + "_dense_feature_regularization") + grouped_feature_weight_vec = self._feature_grouping(param, net) + + # Compute Lpnorm over l2norm: + # pow( sum_i { pow(theda_i, p) } , 1/p) + lp_vec_raised = net.Pow( + [grouped_feature_weight_vec], [net.NextScopedBlob("lp_vec_raised")], exponent=self.p_value + ) + lp_vec_summed = net.ReduceFrontSum( + [lp_vec_raised], [net.NextScopedBlob("lp_vec_summed")] + ) + lp_vec = net.Pow( + [lp_vec_summed], [net.NextScopedBlob("lp_vec")], exponent=(1 / self.p_value) + ) + net.Scale([lp_vec], [output_blob], scale=self.reg_lambda) + return output_blob class L1NormTrimmed(Regularizer): """ From 214a676f6b5b2688af3d939e5f5c322a68834af5 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 3 Oct 2019 10:32:00 -0700 Subject: [PATCH 014/113] Provide (but skip) 3.5 job by default on all PRs. (#27293) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27293 This doesn't turn on 3.5 signal, but it makes it so that [test all] will include it if you do request it. Signed-off-by: Edward Z. 
Yang Test Plan: Imported from OSS Differential Revision: D17738741 Pulled By: ezyang fbshipit-source-id: 2b1af4d7bf26fd84a593fde292d6bfa2aabc1148 --- .circleci/cimodel/data/pytorch_build_data.py | 2 +- .circleci/config.yml | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.circleci/cimodel/data/pytorch_build_data.py b/.circleci/cimodel/data/pytorch_build_data.py index ef328d1668014..9ebb2334dba5d 100644 --- a/.circleci/cimodel/data/pytorch_build_data.py +++ b/.circleci/cimodel/data/pytorch_build_data.py @@ -8,7 +8,7 @@ (None, [ XImportant("2.7.9"), X("2.7"), - X("3.5"), + XImportant("3.5"), # Not run on all PRs, but should be included on [test all] X("nightly"), ]), ("gcc", [ diff --git a/.circleci/config.yml b/.circleci/config.yml index fcc1c5df7eb95..a10e63046c795 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1518,11 +1518,6 @@ workflows: name: pytorch_linux_xenial_py3_5_build requires: - setup - filters: - branches: - only: - - master - - /ci-all\/.*/ build_environment: "pytorch-linux-xenial-py3.5-build" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.5:347" - pytorch_linux_test: @@ -1530,11 +1525,6 @@ workflows: requires: - setup - pytorch_linux_xenial_py3_5_build - filters: - branches: - only: - - master - - /ci-all\/.*/ build_environment: "pytorch-linux-xenial-py3.5-test" docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.5:347" resource_class: large From 1bc7ea17b2c45691c36894a7e5dad31296f4e2d7 Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Thu, 3 Oct 2019 10:38:07 -0700 Subject: [PATCH 015/113] more profiler changes in C++ before enabling checkScript changes Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26909 Differential Revision: D17683632 Pulled By: Krovatkin fbshipit-source-id: 5d36c3c4cf7411c56485ef19fe59262b9f8b45b2 --- test/cpp/jit/test_misc.cpp | 2 +- test/test_jit.py | 13 +-- torch/csrc/jit/graph_executor.h | 6 +- torch/csrc/jit/ir.cpp | 1 + torch/csrc/jit/passes/alias_analysis.cpp | 17 ++- .../jit/profiling_graph_executor_impl.cpp | 42 ++++++- torch/csrc/jit/profiling_record.cpp | 104 +++++++++++++----- torch/csrc/jit/profiling_record.h | 1 + 8 files changed, 133 insertions(+), 53 deletions(-) diff --git a/test/cpp/jit/test_misc.cpp b/test/cpp/jit/test_misc.cpp index d131d3b80bd87..27e5a376c7b42 100644 --- a/test/cpp/jit/test_misc.cpp +++ b/test/cpp/jit/test_misc.cpp @@ -1049,7 +1049,7 @@ void testInsertAndEliminateRedundantGuards() { checkShape(*guard, {2, 3}, false); auto is_guard = [](Node* n) { return n->kind() == prim::Guard; }; int num_guards = std::count_if(nodes.begin(), nodes.end(), is_guard); - ASSERT_EQ(num_guards, 11); + ASSERT_EQ(num_guards, 12); // now eliminate as many guards as possible // we should be left with two guards on x and y's defs EliminateRedundantGuards(copy); diff --git a/test/test_jit.py b/test/test_jit.py index 6757eb394b343..c8afca1d9b24f 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -5006,18 +5006,11 @@ def def_in_one_branch(x, z): a = torch.rand(2, 3) with enable_profiling_mode(): - # the first calls are profiled - def_in_one_branch(a, False) - # check prim::profile are inserted - profiled_graph_str = str(def_in_one_branch.graph_for(a, True)) + # the first call is profiled + profiled_graph_str = str(def_in_one_branch.graph_for(a, False)) FileCheck().check_count("prim::profile", 4).run(profiled_graph_str) + # the second call is optimized def_in_one_branch(a, False) 
- def_in_one_branch(a, False) - # this call is optimized for - # the given shape of (2, 3) - def_in_one_branch(a, False) - # change shape to (3) - # so we go down a bailout path a = torch.ones(3) # check prim::BailOuts are inserted bailout_graph_str = str(def_in_one_branch.graph_for(a, True)) diff --git a/torch/csrc/jit/graph_executor.h b/torch/csrc/jit/graph_executor.h index bcbd817602566..14cc8376b3f16 100644 --- a/torch/csrc/jit/graph_executor.h +++ b/torch/csrc/jit/graph_executor.h @@ -1,11 +1,13 @@ #pragma once +#include +#include + #include #include #include #include #include -#include namespace torch { namespace jit { @@ -58,7 +60,7 @@ TORCH_API void runRequiredPasses(const std::shared_ptr& g); TORCH_API void debugSetAutodiffSubgraphInlining(bool state); TORCH_API std::shared_ptr lastExecutedOptimizedGraph(); -TORCH_API bool& getProfilingMode(); +TORCH_API std::atomic &getProfilingMode(); struct TORCH_API GraphOptimizerEnabledGuard { GraphOptimizerEnabledGuard(bool state) diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 15edbe625e7fb..8493d11e70d74 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -928,6 +928,7 @@ bool Node::hasSideEffects() const { case prim::CallFunction: case prim::CallMethod: case prim::BailoutTemplate: + case prim::profile: return true; } diff --git a/torch/csrc/jit/passes/alias_analysis.cpp b/torch/csrc/jit/passes/alias_analysis.cpp index 991f97478f0c4..05b8940bf776b 100644 --- a/torch/csrc/jit/passes/alias_analysis.cpp +++ b/torch/csrc/jit/passes/alias_analysis.cpp @@ -345,12 +345,17 @@ void AliasDb::analyzeImpl(Node* node) { case prim::SetAttr: return analyzeSetAttr(node); case prim::profile: - AT_ERROR("Analyzing prim::profile isn't yet implemented"); - // TODO: simply mapping inputs' aliases to outputs' - // should work but a) we should probably avoid exposing - // prim::profile to optimizations b) the alias semantics - // might be more complicated than just mapAliases - // mapAliases(node->inputs(), node->outputs()); + if (node->inputs().size() > 0) { + makePointerTo(node->output(), node->inputs().at(0)); + } + return; + case prim::BailOut: + TORCH_INTERNAL_ASSERT(node->inputs().at(0)->node()->kind() == + prim::BailoutTemplate); + makePointerTo(node->output(), node->inputs().at(1)); + return; + case prim::Guard: + makePointerTo(node->output(), node->inputs().at(0)); return; case prim::CallFunction: case prim::CallMethod: diff --git a/torch/csrc/jit/profiling_graph_executor_impl.cpp b/torch/csrc/jit/profiling_graph_executor_impl.cpp index 26030fca5014e..4b3c9779b610c 100644 --- a/torch/csrc/jit/profiling_graph_executor_impl.cpp +++ b/torch/csrc/jit/profiling_graph_executor_impl.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -7,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,8 +17,8 @@ namespace torch { namespace jit { -thread_local bool profiling_mode = false; -bool& getProfilingMode() { +static std::atomic profiling_mode{false}; +std::atomic& getProfilingMode() { return profiling_mode; } @@ -58,7 +60,12 @@ ExecutionPlan ProfilingGraphExecutorImpl::getPlanFor(Stack& stack) { if (!pr_) { pr_ = ProfilingRecord::instrumentGraph(prepareGraph(graph, stack)); - profiling_plan_ = ExecutionPlan(pr_->profiled_graph_); + auto copy = pr_->graph()->copy(); + LowerGradOf(*copy); + RemoveExpands(copy); + CanonicalizeOps(copy); + EliminateDeadCode(copy); + profiling_plan_ = ExecutionPlan(copy); // fall-through } @@ -68,6 +75,12 @@ ExecutionPlan ProfilingGraphExecutorImpl::getPlanFor(Stack& 
stack) { // copy already has differentiableGraphs auto copy = pr_->graph()->copy(); + if (!getGraphExecutorOptimize()) { + runRequiredPasses(copy); + optimized_plan_ = ExecutionPlan(copy); + return *optimized_plan_; + } + // insert bailouts InsertGuards(copy); // get rid of autograd specific ops @@ -83,10 +96,29 @@ ExecutionPlan ProfilingGraphExecutorImpl::getPlanFor(Stack& stack) { CanonicalizeOps(copy); EliminateRedundantGuards(copy); InsertBailOuts(copy); - // regular optimizations + // TODO: this runs specializeAutogradZero ?? + GRAPH_DUMP("After InsertBailOuts: ", copy); + runRequiredPasses(copy); ConstantPropagation(copy); runOptimization(copy); - runNondiffOptimization(copy); + if (needsGradient(copy)) { + auto diff_nodes = CreateAutodiffSubgraphs( + copy, + getAutodiffSubgraphInlining() ? autodiffSubgraphNodeThreshold : 1); + for (Node *dnode : diff_nodes) { + auto diff_graph = std::move(dnode->g(attr::Subgraph)); + Gradient gradient = differentiate(diff_graph); + runOptimization(gradient.f); + // run non diff optimization on the forward graph + runNondiffOptimization(gradient.f); + packGradient(gradient, dnode); + } + InlineAutodiffSubgraphs(copy, getAutodiffSubgraphInlining() + ? autodiffSubgraphInlineThreshold + : 1); + } else { + runNondiffOptimization(copy); + } EliminateDeadCode(copy); // cache optimized_plan_ = ExecutionPlan(copy); diff --git a/torch/csrc/jit/profiling_record.cpp b/torch/csrc/jit/profiling_record.cpp index 07f0b203b18d2..dc2355ce8adef 100644 --- a/torch/csrc/jit/profiling_record.cpp +++ b/torch/csrc/jit/profiling_record.cpp @@ -5,7 +5,7 @@ namespace torch { namespace jit { ProfilingRecord::ProfilingRecord(std::shared_ptr g) - : profiled_graph_(std::move(g)), profiling_count_(3) {} + : profiled_graph_(std::move(g)), profiling_count_(1) {} ProfileOp* ProfilingRecord::createProfileNode( const std::function& fp, @@ -18,7 +18,71 @@ ProfileOp* ProfilingRecord::createProfileNode( return pn; } -void ProfilingRecord::instrumentBlock(Block* block) { +static void unprofileGraphInputs(const std::shared_ptr &graph) { + for (auto i : graph->inputs()) { + if (i->type()->isSubtypeOf(TensorType::get())) { + i->setType(unshapedType(i->type())); + } + } +} + +static void unprofileBlock(Block* start_block) { + std::vector stack; + stack.push_back(start_block); + + while (!stack.empty()) { + Block* block = stack.back(); + stack.pop_back(); + + for (auto n : block->nodes()) { + for (auto o : n->outputs()) { + if (o->type()->isSubtypeOf(TensorType::get())) { + o->setType(unshapedType(o->type())); + } + } + stack.insert(stack.end(), n->blocks().begin(), n->blocks().end()); + } + } +} + +void ProfilingRecord::insertShapeProfile(Node *n, Value *i) { + + auto pn = createProfileNode(nullptr, {i}); + auto pno = pn->addOutput(); + bool first = true; + pno->setType(TensorType::get()); + std::function shape_profiler = [this, pno, + first](Stack &stack) mutable { + IValue t; + pop(stack, t); + if (t.isTensor()) { + + if (t.toTensor().defined()) { + auto pttp = TensorType::create(t.toTensor()); + std::lock_guard lock(this->mutex_); + if (auto type = pno->type()->cast()) { + if (!first) { + pttp = pttp->merge(type); + } + pno->setType(pttp); + first = false; + } + } else { + pno->setType(TensorType::get()->withUndefined()); + } + } + + // passing t through + push(stack, t); + + }; + + pn->setCallback(shape_profiler); + pn->insertBefore(n); + n->replaceInputWith(i, pn->output()); +} + +void ProfilingRecord::instrumentBlock(Block *block) { for (auto it = block->nodes().begin(); it != 
block->nodes().end(); ++it) { auto n = *it; for (auto i : n->inputs()) { @@ -27,32 +91,7 @@ void ProfilingRecord::instrumentBlock(Block* block) { continue; } - auto pn = createProfileNode(nullptr, {i}); - auto pno = pn->addOutput(); - bool first = true; - pno->setType(TensorType::get()); - std::function shape_profiler = - [this, pno, first](Stack& stack) mutable { - IValue t; - pop(stack, t); - if (t.isTensor()) { - auto pttp = TensorType::create(t.toTensor()); - std::lock_guard lock(this->mutex_); - if (auto type = pno->type()->cast()) { - if (!first) { - pttp = pttp->merge(type); - } - pno->setType(pttp); - first = false; - } - } - // passing t through - push(stack, t); - }; - - pn->setCallback(shape_profiler); - pn->insertBefore(n); - n->replaceInputWith(i, pn->output()); + insertShapeProfile(n, i); } for (auto b : n->blocks()) { @@ -66,8 +105,15 @@ std::unique_ptr ProfilingRecord::instrumentGraph( auto new_g = graph->copy(); auto pr = std::unique_ptr(new ProfilingRecord(new_g)); auto raw_pr = pr.get(); - + unprofileGraphInputs(new_g); + unprofileBlock(new_g->block()); pr->instrumentBlock(new_g->block()); + + for (auto i : new_g->return_node()->inputs()) { + if (i->type()->isSubtypeOf(TensorType::get())) { + pr->insertShapeProfile(new_g->return_node(), i); + } + } std::function counter = [raw_pr](Stack&) { std::lock_guard lock(raw_pr->mutex_); if (raw_pr->profiling_count_ > 0) diff --git a/torch/csrc/jit/profiling_record.h b/torch/csrc/jit/profiling_record.h index 8a15ff4c55dbe..73a332de1f59a 100644 --- a/torch/csrc/jit/profiling_record.h +++ b/torch/csrc/jit/profiling_record.h @@ -39,6 +39,7 @@ struct ProfilingRecord { const std::function& fp, at::ArrayRef inputs); void instrumentBlock(Block* block); + void insertShapeProfile(Node *n, Value *i); ProfilingRecord(std::shared_ptr g); }; From 882b2abb804b8993eb089486ce536a5dc03dd8f0 Mon Sep 17 00:00:00 2001 From: Nikolay Korovaiko Date: Thu, 3 Oct 2019 11:04:50 -0700 Subject: [PATCH 016/113] Fix segfault while printing value type for an error msg in emitListComprehension Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27261 Differential Revision: D17740159 Pulled By: Krovatkin fbshipit-source-id: 90439282aea14d8634eb41ffece5b6320d615fa7 --- torch/csrc/jit/script/compiler.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 245e20fd37e2c..c64408eed21be 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -1135,8 +1135,9 @@ struct to_ir { list_type = getListCompType(lc, IntType::get()); } else { throw ErrorReport(lc.range()) - << "iterator expression is expected to be a list, iterable, or range, found " - << (siv ? 
siv->getValue()->type()->python_str() : siv->kind()); + << "iterator expression is expected to be a list, iterable, or " + "range, found " + << sv->kind(); } // given `[x*2 for x in my_list]` this generates the following AST: From 111da779125577e254eec7f45d272a8f14a8d550 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Thu, 3 Oct 2019 11:46:53 -0700 Subject: [PATCH 017/113] Factored out the default mappings Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27164 Test Plan: Imported from OSS Differential Revision: D17694475 Pulled By: zafartahirov fbshipit-source-id: df8df5f7d66062ed35da957064a31344e1d3c961 --- test/common_quantization.py | 3 +- torch/quantization/__init__.py | 3 +- torch/quantization/default_mappings.py | 59 +++++++++++ torch/quantization/quantize.py | 133 ++++++------------------- torch/quantization/stubs.py | 54 ++++++++++ 5 files changed, 146 insertions(+), 106 deletions(-) create mode 100644 torch/quantization/default_mappings.py create mode 100644 torch/quantization/stubs.py diff --git a/test/common_quantization.py b/test/common_quantization.py index 08bd68f9869fe..485d454320a7d 100644 --- a/test/common_quantization.py +++ b/test/common_quantization.py @@ -15,7 +15,8 @@ from common_utils import TestCase from torch.quantization import QuantWrapper, QuantStub, DeQuantStub, \ default_qconfig, default_per_channel_qconfig, QConfig, default_observer, default_weight_observer, \ - propagate_qconfig_, convert, DEFAULT_DYNAMIC_MODULE_MAPPING + propagate_qconfig_, convert +from torch.quantization.default_mappings import DEFAULT_DYNAMIC_MODULE_MAPPING def test_only_eval_fn(model, calib_data): r""" diff --git a/torch/quantization/__init__.py b/torch/quantization/__init__.py index ad498381f2143..724e6e0ca90b6 100644 --- a/torch/quantization/__init__.py +++ b/torch/quantization/__init__.py @@ -4,6 +4,7 @@ from .QConfig import * # noqa: F401 from .fake_quantize import * # noqa: F401 from .fuse_modules import fuse_modules # noqa: F401 +from .stubs import * # noqa: F401 def default_eval_fn(model, calib_data): r""" @@ -14,7 +15,7 @@ def default_eval_fn(model, calib_data): model(data) _all__ = [ - 'QuantWrapper', 'QuantStub', 'DeQuantStub', 'DEFAULT_MODULE_MAPPING', + 'QuantWrapper', 'QuantStub', 'DeQuantStub', # Top level API for eager mode quantization 'quantize', # Sub functions used by eager mode quantization diff --git a/torch/quantization/default_mappings.py b/torch/quantization/default_mappings.py new file mode 100644 index 0000000000000..b5fb1b017f2d0 --- /dev/null +++ b/torch/quantization/default_mappings.py @@ -0,0 +1,59 @@ + +from torch import nn + +import torch.nn.intrinsic as nni +import torch.nn.intrinsic.quantized as nniq +import torch.nn.intrinsic.qat as nniqat +import torch.nn.quantized as nnq +import torch.nn.quantized.dynamic as nnqd +import torch.nn.qat as nnqat + +from .stubs import QuantStub, DeQuantStub + +# Map for swapping float module to quantized ones +DEFAULT_MODULE_MAPPING = { + nn.Linear: nnq.Linear, + nn.ReLU: nnq.ReLU, + nn.Conv2d: nnq.Conv2d, + QuantStub: nnq.Quantize, + DeQuantStub: nnq.DeQuantize, + # Wrapper Modules: + nnq.FloatFunctional: nnq.QFunctional, + # Intrinsic modules: + nni.ConvReLU2d: nniq.ConvReLU2d, + nni.LinearReLU: nniq.LinearReLU, + nniqat.ConvReLU2d: nniq.ConvReLU2d, + nniqat.LinearReLU: nniq.LinearReLU, + nniqat.ConvBn2d: nnq.Conv2d, + nniqat.ConvBnReLU2d: nniq.ConvReLU2d, + # QAT modules: + nnqat.Linear: nnq.Linear, + nnqat.Conv2d: nnq.Conv2d, +} + +# Map for swapping float module to qat modules 
+DEFAULT_QAT_MODULE_MAPPING = { + nn.Linear: nnqat.Linear, + nn.Conv2d: nnqat.Conv2d, + # Intrinsic modules: + nni.ConvBn2d: nniqat.ConvBn2d, + nni.ConvBnReLU2d: nniqat.ConvBnReLU2d, + nni.ConvReLU2d: nniqat.ConvReLU2d, + nni.LinearReLU: nniqat.LinearReLU +} + +# Map for swapping dynamic modules +DEFAULT_DYNAMIC_MODULE_MAPPING = { + nn.Linear: nnqd.Linear, + nn.LSTM: nnqd.LSTM, +} + +# List of modules to skip the qconfig propagation +DEFAULT_SKIP_LIST = ( + nn.Dropout, + nn.Identity, + nn.MaxPool2d, + nn.AvgPool2d, + nn.AdaptiveAvgPool2d, + DeQuantStub +) diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 85403b71a3063..fed6b57a2d06a 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -2,46 +2,22 @@ import copy import itertools +import warnings + import torch import torch.nn as nn import torch.nn.intrinsic as nni -import torch.nn.intrinsic.quantized as nniq -import torch.nn.intrinsic.qat as nniqat import torch.nn.quantized as nnq -import torch.nn.quantized.dynamic as nnqd -from .QConfig import default_dynamic_qconfig, float16_dynamic_qconfig -import torch.nn.qat as nnqat -import warnings - -class QuantStub(nn.Module): - r"""Quantize stub module, before calibration, this is same as an observer, - it will be swapped as `nnq.Quantize` in `convert`. - - Args: - qconfig: quantization configuration for the tensor, - if qconfig is not provided, we will get qconfig from parent modules - """ - def __init__(self, qconfig=None): - super(QuantStub, self).__init__() - if qconfig: - self.qconfig = qconfig - - def forward(self, x): - return x - -class DeQuantStub(nn.Module): - r"""Dequantize stub module, before calibration, this is same as identity, - this will be swapped as `nnq.DeQuantize` in `convert`. - """ - def __init__(self): - super(DeQuantStub, self).__init__() - def forward(self, x): - return x - -DEFAULT_SKIP_LIST = [nn.Dropout, nn.Identity, nn.MaxPool2d, nn.AvgPool2d, nn.AdaptiveAvgPool2d, DeQuantStub] +from .default_mappings import (DEFAULT_DYNAMIC_MODULE_MAPPING, + DEFAULT_MODULE_MAPPING, + DEFAULT_QAT_MODULE_MAPPING, + DEFAULT_SKIP_LIST) +from .stubs import DeQuantStub, QuantWrapper +from .QConfig import default_dynamic_qconfig, float16_dynamic_qconfig -def _propagate_qconfig_helper(module, qconfig_dict, skip_list=DEFAULT_SKIP_LIST, qconfig_parent=None, prefix=''): +def _propagate_qconfig_helper(module, qconfig_dict, skip_list=None, + qconfig_parent=None, prefix=''): r"""This is a helper function for `propagate_qconfig_` Args: @@ -57,6 +33,8 @@ def _propagate_qconfig_helper(module, qconfig_dict, skip_list=DEFAULT_SKIP_LIST, Return: None, module is modified inplace with qconfig attached """ + if skip_list is None: + skip_list = DEFAULT_SKIP_LIST if type(module) in skip_list: module.qconfig = None if not hasattr(module, 'qconfig'): @@ -77,7 +55,8 @@ def _propagate_qconfig_helper(module, qconfig_dict, skip_list=DEFAULT_SKIP_LIST, for name, child in module.named_children(): module_prefix = prefix + '.' 
+ name if prefix else name - _propagate_qconfig_helper(child, qconfig_dict, skip_list, module.qconfig, module_prefix) + _propagate_qconfig_helper(child, qconfig_dict, skip_list, + module.qconfig, module_prefix) # TODO(jerryzh): expose skip_list def propagate_qconfig_(module, qconfig_dict=None): @@ -86,10 +65,10 @@ def propagate_qconfig_(module, qconfig_dict=None): Args: module: input module - qconfig_dict: dictionary that maps from name or type of submodule to quantization - configuration, qconfig applies to all submodules of a given - module unless qconfig for the submodules are specified (when the - submodule already has qconfig attribute) + qconfig_dict: dictionary that maps from name or type of submodule to + quantization configuration, qconfig applies to all submodules of a + given module unless qconfig for the submodules are specified (when + the submodule already has qconfig attribute) Return: None, module is modified inplace with qconfig attached @@ -132,30 +111,6 @@ def add_observer_(module): module.add_module('observer', module.qconfig.activation()) module.register_forward_hook(_observer_forward_hook) -class QuantWrapper(nn.Module): - r"""A wrapper class that wraps the input module, adds QuantStub and - DeQuantStub and surround the call to module with call to quant and dequant - modules. - - This is used by the `quantization` utility functions to add the quant and - dequant modules, before `convert` function `QuantStub` will just be observer, - it observes the input tensor, after `convert`, `QuantStub` - will be swapped to `nnq.Quantize` which does actual quantization. Similarly - for `DeQuantStub`. - """ - def __init__(self, module): - super(QuantWrapper, self).__init__() - qconfig = module.qconfig if hasattr(module, 'qconfig') else None - self.add_module('quant', QuantStub(qconfig)) - self.add_module('dequant', DeQuantStub()) - self.add_module('module', module) - self.train(module.training) - - def forward(self, X): - X = self.quant(X) - X = self.module(X) - return self.dequant(X) - def add_quant_dequant(module): r"""Wrap the leaf child module in QuantWrapper if it has a valid qconfig Note that this function will modify the children of module inplace and it @@ -206,44 +161,7 @@ def prepare(model, qconfig_dict=None, inplace=False): add_observer_(model) return model -# Map for swapping float module to quantized ones -DEFAULT_MODULE_MAPPING = { - nn.Linear: nnq.Linear, - nn.ReLU: nnq.ReLU, - nn.Conv2d: nnq.Conv2d, - QuantStub: nnq.Quantize, - DeQuantStub: nnq.DeQuantize, - # Wrapper Modules: - nnq.FloatFunctional: nnq.QFunctional, - # Intrinsic modules: - nni.ConvReLU2d: nniq.ConvReLU2d, - nni.LinearReLU: nniq.LinearReLU, - nniqat.ConvReLU2d: nniq.ConvReLU2d, - nniqat.LinearReLU: nniq.LinearReLU, - nniqat.ConvBn2d: nnq.Conv2d, - nniqat.ConvBnReLU2d: nniq.ConvReLU2d, - # QAT modules: - nnqat.Linear: nnq.Linear, - nnqat.Conv2d: nnq.Conv2d, -} - -# Map for swapping float module to qat modules -DEFAULT_QAT_MODULE_MAPPING = { - nn.Linear: nnqat.Linear, - nn.Conv2d: nnqat.Conv2d, - # Intrinsic modules: - nni.ConvBn2d: nniqat.ConvBn2d, - nni.ConvBnReLU2d: nniqat.ConvBnReLU2d, - nni.ConvReLU2d: nniqat.ConvReLU2d, - nni.LinearReLU: nniqat.LinearReLU -} - -DEFAULT_DYNAMIC_MODULE_MAPPING = { - nn.Linear: nnqd.Linear, - nn.LSTM: nnqd.LSTM, -} - -def quantize(model, run_fn, run_args, mapping=DEFAULT_MODULE_MAPPING, inplace=False): +def quantize(model, run_fn, run_args, mapping=None, inplace=False): r"""Converts a float model to quantized model. 
First it will prepare the model for calibration or training, then it calls @@ -262,7 +180,8 @@ def quantize(model, run_fn, run_args, mapping=DEFAULT_MODULE_MAPPING, inplace=Fa Return: Quantized model. """ - + if mapping is None: + mapping = DEFAULT_MODULE_MAPPING if not inplace: model = copy.deepcopy(model) model.eval() @@ -271,7 +190,8 @@ def quantize(model, run_fn, run_args, mapping=DEFAULT_MODULE_MAPPING, inplace=Fa convert(model, mapping, inplace=True) return model -def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, mapping=DEFAULT_DYNAMIC_MODULE_MAPPING, inplace=False): +def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, + mapping=None, inplace=False): r"""Converts a float model to dynamic (i.e. weights-only) quantized model. Replaces specified modules with dynamic weight-only quantized versions and output the quantized model. @@ -320,6 +240,9 @@ def quantize_dynamic(model, qconfig_spec=None, dtype=torch.qint8, mapping=DEFAUL raise RuntimeError('Unknown dtype specified for quantize_dynamic: ', str(dtype)) qconfig_spec = dict(zip(qconfig_spec, itertools.repeat(default_qconfig))) + if mapping is None: + mapping = DEFAULT_DYNAMIC_MODULE_MAPPING + if not inplace: model = copy.deepcopy(model) model.eval() @@ -352,7 +275,7 @@ def quantize_qat(model, run_fn, run_args, inplace=False): convert(model, inplace=True) return model -def convert(module, mapping=DEFAULT_MODULE_MAPPING, inplace=False): +def convert(module, mapping=None, inplace=False): r"""Converts the float module with observers (where we can get quantization parameters) to a quantized module. Args: @@ -361,6 +284,8 @@ def convert(module, mapping=DEFAULT_MODULE_MAPPING, inplace=False): module type, can be overwrritten to allow swapping user defined Modules inplace: carry out model transformations in-place, the original module is mutated """ + if mapping is None: + mapping = DEFAULT_MODULE_MAPPING if not inplace: module = copy.deepcopy(module) reassign = {} diff --git a/torch/quantization/stubs.py b/torch/quantization/stubs.py new file mode 100644 index 0000000000000..7018ef1b836da --- /dev/null +++ b/torch/quantization/stubs.py @@ -0,0 +1,54 @@ + +from torch import nn + +class QuantStub(nn.Module): + r"""Quantize stub module, before calibration, this is same as an observer, + it will be swapped as `nnq.Quantize` in `convert`. + + Args: + qconfig: quantization configuration for the tensor, + if qconfig is not provided, we will get qconfig from parent modules + """ + def __init__(self, qconfig=None): + super(QuantStub, self).__init__() + if qconfig: + self.qconfig = qconfig + + def forward(self, x): + return x + + +class DeQuantStub(nn.Module): + r"""Dequantize stub module, before calibration, this is same as identity, + this will be swapped as `nnq.DeQuantize` in `convert`. + """ + def __init__(self): + super(DeQuantStub, self).__init__() + + def forward(self, x): + return x + + +class QuantWrapper(nn.Module): + r"""A wrapper class that wraps the input module, adds QuantStub and + DeQuantStub and surround the call to module with call to quant and dequant + modules. + + This is used by the `quantization` utility functions to add the quant and + dequant modules, before `convert` function `QuantStub` will just be observer, + it observes the input tensor, after `convert`, `QuantStub` + will be swapped to `nnq.Quantize` which does actual quantization. Similarly + for `DeQuantStub`. 
+ """ + def __init__(self, module): + super(QuantWrapper, self).__init__() + qconfig = module.qconfig if hasattr(module, 'qconfig') else None + self.add_module('quant', QuantStub(qconfig)) + self.add_module('dequant', DeQuantStub()) + self.add_module('module', module) + self.train(module.training) + + def forward(self, X): + X = self.quant(X) + X = self.module(X) + return self.dequant(X) From 7b2e8c323c97f12cff402b347e59541ad67d75cb Mon Sep 17 00:00:00 2001 From: Vitaly Fedyunin Date: Thu, 3 Oct 2019 12:04:42 -0700 Subject: [PATCH 018/113] Add memory format argument to the `clone` operator (#27106) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27106 Adds memory_format option to the `clone` operator. Introduce new `clone` behavior if used with `input_t.clone(memory_format=torch.preserve_format)`: 1) If tensor is non-overlapping and dense - output tensor will have the same strides as input tensor. 2) If not (1) and tensor is stored in the channels last format, output tensor going to have channels last format. 3) Output tensor is going to be contiguous in all other cases. --- Dense tensor is the tensor that store values in a contiguous block of memory. Non-overlapping tensor is the tensor in which elements occupy individual non-repetitive memory. Test Plan: Imported from OSS Differential Revision: D17699357 Pulled By: VitalyFedyunin fbshipit-source-id: 5ae1537c2aca1abf0bf1eec4416846129c156f66 --- aten/src/ATen/core/OpsAlreadyMovedToC10.cpp | 2 +- aten/src/ATen/core/TensorBody.h | 7 ++++- aten/src/ATen/core/TensorMethods.h | 13 ++++----- aten/src/ATen/native/TensorFactories.cpp | 16 +++++++++-- aten/src/ATen/native/mkldnn/TensorShape.cpp | 8 ++++-- aten/src/ATen/native/native_functions.yaml | 3 +- aten/src/ATen/native/quantized/QTensor.cpp | 20 +++++++++++-- aten/src/ATen/native/sparse/SparseTensor.cpp | 6 +++- aten/src/ATen/templates/TensorBody.h | 5 ++++ c10/core/TensorImpl.cpp | 29 +++++++++++++++++++ c10/core/TensorImpl.h | 18 ++++++++++++ test/test_torch.py | 30 ++++++++++++++++++++ tools/autograd/derivatives.yaml | 2 +- torch/csrc/jit/passes/shape_analysis.cpp | 2 +- torch/onnx/symbolic_opset9.py | 12 ++++---- 15 files changed, 147 insertions(+), 26 deletions(-) diff --git a/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp b/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp index d74609e0e988b..f08f3321ccf57 100644 --- a/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp +++ b/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp @@ -419,7 +419,6 @@ bool aten_op_is_already_moved_to_c10(const c10::OperatorName& opName) { {"aten::frobenius_norm", "dim"}, {"aten::nuclear_norm", ""}, {"aten::nuclear_norm", "dim"}, - {"aten::clone", ""}, {"aten::resize_as_", ""}, {"aten::pow", "Tensor_Scalar"}, {"aten::zero_", ""}, @@ -1201,6 +1200,7 @@ bool aten_op_is_not_moved_to_c10_yet(const c10::OperatorName& opName) { {"aten::frobenius_norm", "out"}, {"aten::nuclear_norm", "out"}, {"aten::nuclear_norm", "dim_out"}, + {"aten::clone", ""}, {"aten::pow", "Tensor_Scalar_out"}, {"aten::sub", "out"}, {"aten::addmm", "out"}, diff --git a/aten/src/ATen/core/TensorBody.h b/aten/src/ATen/core/TensorBody.h index bdda41eff145d..9de737b527314 100644 --- a/aten/src/ATen/core/TensorBody.h +++ b/aten/src/ATen/core/TensorBody.h @@ -186,10 +186,15 @@ class CAFFE2_API Tensor { int64_t ndimension() const { return dim(); } + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const { return impl_->is_contiguous(memory_format); } + bool is_non_overlapping_and_dense() const { + return 
impl_->is_non_overlapping_and_dense(); + } + at::MemoryFormat suggest_memory_format() const { if (impl_->is_strides_like_channels_last()) { return at::MemoryFormat::ChannelsLast; @@ -734,7 +739,7 @@ class CAFFE2_API Tensor { #ifdef BUILD_NAMEDTENSOR Tensor norm(c10::optional p, DimnameList dim, bool keepdim=false) const; #endif - Tensor clone() const; + Tensor clone(c10::optional memory_format=c10::nullopt) const; Tensor & resize_as_(const Tensor & the_template) const; Tensor pow(Scalar exponent) const; Tensor & zero_() const; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index ae225c014ccdd..e5e3de59248bc 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -3044,26 +3044,25 @@ inline Tensor Tensor::norm(c10::optional p, DimnameList dim, bool keepdi #endif } #endif -inline Tensor Tensor::clone() const { +inline Tensor Tensor::clone(c10::optional memory_format) const { #ifdef USE_STATIC_DISPATCH at::AutoNonVariableTypeMode _var_guard(true); switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { case Backend::CPU: - return CPUType::clone(const_cast(*this)); + return CPUType::clone(const_cast(*this), memory_format); break; case Backend::QuantizedCPU: - return QuantizedCPUType::clone(const_cast(*this)); + return QuantizedCPUType::clone(const_cast(*this), memory_format); break; case Backend::SparseCPU: - return SparseCPUType::clone(const_cast(*this)); + return SparseCPUType::clone(const_cast(*this), memory_format); break; default: AT_ERROR("clone not implemented for ", at::toString(type_set())); } #else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::clone", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); + static auto table = globalATenDispatch().getOpTable("aten::clone(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor"); + return table->callUnboxed>(const_cast(*this), memory_format); #endif } inline Tensor & Tensor::resize_as_(const Tensor & the_template) const { diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index cb965b6f77959..0a1d7422a04b7 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -867,8 +867,20 @@ Tensor from_file(std::string filename, c10::optional shared, c10::optional // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ clone ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Tensor clone(const Tensor& src) { - auto self = at::empty_like(src); +Tensor clone(const Tensor& src, c10::optional optional_memory_format) { + auto memory_format = + optional_memory_format.value_or(MemoryFormat::Contiguous); + if (memory_format == MemoryFormat::Preserve) { + if (src.is_non_overlapping_and_dense()) { + // Copy all strides + auto self = at::empty_strided(src.sizes(), src.strides(), src.options()); + self.copy_(src); + return self; + } else { + memory_format = src.suggest_memory_format(); + } + } + auto self = at::empty_like(src, src.options(), memory_format); self.copy_(src); return self; } diff --git a/aten/src/ATen/native/mkldnn/TensorShape.cpp b/aten/src/ATen/native/mkldnn/TensorShape.cpp index 1567df04eca67..b45784dc2e96a 100644 --- a/aten/src/ATen/native/mkldnn/TensorShape.cpp +++ b/aten/src/ATen/native/mkldnn/TensorShape.cpp @@ -16,7 +16,7 @@ Tensor mkldnn_reshape(const Tensor& self, IntArrayRef size) { AT_ERROR("mkldnn_reshape: ATen not compiled with MKLDNN support"); } -Tensor mkldnn_clone(const Tensor& self) { +Tensor mkldnn_clone(const Tensor& self, c10::optional optional_memory_format) { AT_ERROR("mkldnn_clone: ATen not compiled with MKLDNN support"); } @@ -54,7 +54,11 @@ Tensor mkldnn_reshape(const Tensor& self, IntArrayRef size) { return new_with_itensor_mkldnn(std::move(y), self.options()); } -Tensor mkldnn_clone(const Tensor& self) { +Tensor mkldnn_clone(const Tensor& self, c10::optional optional_memory_format) { + TORCH_CHECK( + !optional_memory_format.has_value(), + "unsupported memory format option ", + optional_memory_format.value()); ideep::tensor& src = itensor_from_mkldnn(self); ideep::tensor dst; ideep::direct_copy::compute(src, dst); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 5c738c9bc282f..8d37c05ae69af 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3108,8 +3108,7 @@ - func: nuclear_norm.dim_out(Tensor self, int[2] dim, bool keepdim=False, *, Tensor(a!) out) -> Tensor(a!) variants: function -- func: clone(Tensor self) -> Tensor - use_c10_dispatcher: full +- func: clone(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor variants: function, method dispatch: CPU: clone diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index e3deeac81362a..0301868befb24 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -155,13 +155,29 @@ Tensor& set_quantizer_(Tensor& self, ConstQuantizerPtr quantizer) { return self; } -Tensor quantized_clone(const Tensor& self) { +Tensor quantized_clone(const Tensor& self, c10::optional optional_memory_format) { // TODO: add per channel support TORCH_INTERNAL_ASSERT( self.qscheme() == at::kPerTensorAffine, "clone for quantized Tensor only works for PerTensorAffine scheme right now"); + + auto memory_format = + optional_memory_format.value_or(MemoryFormat::Contiguous); + + // TODO: To support all features of MemoryFormat::Preserve we need to add + // _empty_affine_quantized_strided function and use it similarly to + // Tensor clone(const Tensor& src, c10::optional optional_memory_format) + // if (self.is_non_overlapping_and_dense()) -> _empty_affine_quantized_strided + if (memory_format == MemoryFormat::Preserve) { + memory_format = self.suggest_memory_format(); + } + Tensor dst = at::_empty_affine_quantized( - self.sizes(), self.options(), self.q_scale(), self.q_zero_point()); + self.sizes(), + self.options(), + self.q_scale(), + self.q_zero_point(), + memory_format); at::native::copy_(dst, self, false); diff --git a/aten/src/ATen/native/sparse/SparseTensor.cpp b/aten/src/ATen/native/sparse/SparseTensor.cpp index 418f403da7e47..7185acbe666ff 100644 --- a/aten/src/ATen/native/sparse/SparseTensor.cpp +++ b/aten/src/ATen/native/sparse/SparseTensor.cpp @@ -255,7 +255,11 @@ Tensor _sparse_coo_tensor_unsafe(const Tensor& indices, const Tensor& values_, A // NB: Deleted newWithSizeNd variants -SparseTensor clone_sparse(const SparseTensor& self) { +SparseTensor clone_sparse(const SparseTensor& self, c10::optional optional_memory_format) { + TORCH_CHECK( + !optional_memory_format.has_value(), + "unsupported memory format option ", + optional_memory_format.value()); SparseTensor other = new_with_dims_sparse(self.sparse_dim(), self.dense_dim(), self.sizes(), self.options()); copy_into_sparse(other, self._indices(), self._values(), true); return other._coalesced_(self.is_coalesced()); diff --git a/aten/src/ATen/templates/TensorBody.h b/aten/src/ATen/templates/TensorBody.h index dc7273d126128..f8a0e6f2aa4a6 100644 --- a/aten/src/ATen/templates/TensorBody.h +++ b/aten/src/ATen/templates/TensorBody.h @@ -186,10 +186,15 @@ class CAFFE2_API Tensor { int64_t ndimension() const { return dim(); } + bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const { return impl_->is_contiguous(memory_format); } + bool is_non_overlapping_and_dense() const { + return impl_->is_non_overlapping_and_dense(); + } + at::MemoryFormat suggest_memory_format() const { if (impl_->is_strides_like_channels_last()) { return at::MemoryFormat::ChannelsLast; diff --git a/c10/core/TensorImpl.cpp b/c10/core/TensorImpl.cpp index dd829a8563766..ac90791ccc452 100644 --- a/c10/core/TensorImpl.cpp +++ b/c10/core/TensorImpl.cpp @@ -130,6 +130,35 @@ bool TensorImpl::compute_strides_like_channels_last() const { return false; } +bool TensorImpl::compute_non_overlapping_and_dense() const { + if (dim() == 1) { + return size(0) < 2 || stride(0) == 1; + } + SmallVector perm; + perm.resize(dim()); + for (int64_t i = 0; i < dim(); i ++) { + perm[i] = i; + } + // Sort by strides, leaving 0 
and 1 sized dims at the end of the array + std::sort(perm.begin(), perm.end(), [&](int64_t a, int64_t b) { + if (sizes_[a] < 2) { + return false; + } + return strides_[a] < strides_[b]; + }); + auto require_stride = 1; + for (int64_t i = 0; i < dim(); i ++) { + if (sizes_[perm[i]] < 2) { + return true; + } + if (strides_[perm[i]] != require_stride) { + return false; + } + require_stride *= sizes_[perm[i]]; + } + return true; +} + void TensorImpl::release_resources() { autograd_meta_.reset(); if (storage_) { diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 366d2b7cbe6a0..4e8dfe8bfc7bd 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -1390,6 +1390,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { is_contiguous_ = false; is_channels_last_contiguous_ = false; is_channels_last_ = false; + is_non_overlapping_and_dense_ = false; switch (memory_format) { case MemoryFormat::Contiguous: { strides_.resize(sizes_.size(), 0); @@ -1401,6 +1402,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { } } is_contiguous_ = true; + is_non_overlapping_and_dense_ = true; return; } case MemoryFormat::ChannelsLast: { @@ -1410,6 +1412,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { set_sizes_and_strides(sizes(), get_channels_last_strides(sizes())); is_channels_last_contiguous_ = true; is_channels_last_ = true; + is_non_overlapping_and_dense_ = true; return; } case MemoryFormat::Preserve: @@ -1421,6 +1424,10 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { return is_channels_last_; } + bool is_non_overlapping_and_dense() const { + return is_non_overlapping_and_dense_; + } + private: // The Caffe2 Resize() method supports being called both as Resize({2,2}) as @@ -1501,6 +1508,8 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { bool compute_strides_like_channels_last() const; + bool compute_non_overlapping_and_dense() const; + protected: /** * Recompute the cached numel of a tensor. Call this if you modify sizes. @@ -1517,6 +1526,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { is_contiguous_ = compute_contiguous(); is_channels_last_contiguous_ = compute_channels_last_contiguous(); is_channels_last_ = is_channels_last_contiguous_ || compute_strides_like_channels_last(); + is_non_overlapping_and_dense_ = is_contiguous_ || is_channels_last_contiguous_ || compute_non_overlapping_and_dense(); } /** @@ -1546,6 +1556,9 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { dest_impl->type_set_ = dest_impl->type_set_.remove(TensorTypeId::VariableTensorId); } dest_impl->is_contiguous_ = src_impl->is_contiguous_; + dest_impl->is_channels_last_contiguous_ = src_impl->is_channels_last_contiguous_; + dest_impl->is_channels_last_ = src_impl->is_channels_last_; + dest_impl->is_non_overlapping_and_dense_ = src_impl->is_non_overlapping_and_dense_; dest_impl->is_wrapped_number_ = src_impl->is_wrapped_number_; dest_impl->reserved_ = src_impl->reserved_; dest_impl->set_version_counter(version_counter); @@ -1641,6 +1654,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target { // contiguous memory block. bool is_channels_last_contiguous_ = false; + // Dense tensor is the tensor that store values in a contiguous block of memory. + // Non-overlapping tensor is the tensor in which elements occupy individual + // non-repetitive memory. 
+ bool is_non_overlapping_and_dense_ = false; + bool is_wrapped_number_ = false; // NOTE [ Metadata Change for a Detached Tensor ] diff --git a/test/test_torch.py b/test/test_torch.py index 0f6dd380354af..5fa75cd631882 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -12402,6 +12402,36 @@ def test_memory_format_preserved_after_permute(self, device): y = nhwc.permute(0, 1, 3, 2).permute(0, 1, 3, 2) self.assertTrue(y.is_contiguous(memory_format=torch.channels_last)) + def test_memory_format_clone(self, device): + nhwc = torch.randn((10, 3, 32, 32), device=device).contiguous(memory_format=torch.channels_last) + # nhwc is not memory dense, but looks like channels last + nhwc = nhwc[:, :, ::2, ::2] + clone = nhwc.clone(memory_format=torch.preserve_format) + self.assertFalse(clone.is_contiguous()) + self.assertTrue(clone.is_contiguous(memory_format=torch.channels_last)) + self.assertFalse(nhwc.is_contiguous()) + self.assertFalse(nhwc.is_contiguous(memory_format=torch.channels_last)) + self.assertEqual(nhwc, clone) + + nhwc = torch.randn((10, 3, 32, 32), device=device).contiguous(memory_format=torch.channels_last) + clone = nhwc.clone(memory_format=torch.contiguous_format) + self.assertTrue(clone.is_contiguous()) + self.assertFalse(clone.is_contiguous(memory_format=torch.channels_last)) + self.assertEqual(nhwc, clone) + + nhwc = torch.randn((10, 3, 32, 32), device=device).contiguous(memory_format=torch.channels_last) + clone = nhwc.clone() + self.assertTrue(clone.is_contiguous()) + self.assertFalse(clone.is_contiguous(memory_format=torch.channels_last)) + self.assertEqual(nhwc, clone) + + x = torch.randn((3, 4, 5, 6, 7, 8, 9), device=device) + for _ in range(10): + permutation = list(range(len(x.shape))) + random.shuffle(permutation) + x = x.permute(permutation) + self.assertEqual(x.stride(), x.clone(memory_format=torch.preserve_format).stride()) + def test_memory_format_empty_like(self, device): x = torch.randn(10, 3, 32, 32, device=device) nhwc = x.contiguous(memory_format=torch.channels_last) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 7a975bdb1b89f..c0d253e68ce5a 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -233,7 +233,7 @@ - name: clamp_max(Tensor self, Scalar max) -> Tensor self: grad * (self <= max).to(grad.dtype()) -- name: clone(Tensor self) -> Tensor +- name: clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor self: grad - name: coalesce(Tensor self) -> Tensor diff --git a/torch/csrc/jit/passes/shape_analysis.cpp b/torch/csrc/jit/passes/shape_analysis.cpp index 0eb66b0b1faa3..813ae711b967b 100644 --- a/torch/csrc/jit/passes/shape_analysis.cpp +++ b/torch/csrc/jit/passes/shape_analysis.cpp @@ -791,7 +791,7 @@ class ShapePropagator { "aten::asin(Tensor self) -> Tensor", "aten::atan(Tensor self) -> Tensor", "aten::ceil(Tensor self) -> Tensor", - "aten::clone(Tensor self) -> Tensor", + "aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor", "aten::contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor", "aten::bernoulli(Tensor self, *, Generator? 
generator) -> Tensor",
        "aten::celu(Tensor self, Scalar alpha) -> Tensor",

diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index 7251d86095e75..4177f1a5567ac 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -972,7 +972,7 @@ def index_select(g, self, dim, index):
         index = g.op("Constant", value_t=torch.LongTensor([index_const]))
     elif index_dim is not None:
         if index_dim == 0:
-            # Index is a scalar. Reshape it to a size 1 tensor.
+            # Index is a scalar. Reshape it to a size 1 tensor.
             index = g.op("Reshape", index, g.op("Constant", value_t=torch.LongTensor([1])))
 
     return g.op("Gather", self, index, axis_i=dim)
@@ -1042,7 +1042,7 @@ def cosine_similarity(g, x1, x2, dim, eps):
 
 
 # ignore clone operators that are inserted by PyTorch autograd
-def clone(g, input):
+def clone(g, input, unused_memory_format):
     return input
 
@@ -1950,10 +1950,10 @@ def multinomial(g, input, num_samples, replacement=False, generator=None):
                 dtype_i=sym_help.cast_pytorch_to_onnx['Long'],
                 sample_size_i=num_samples)
 
-def baddbmm(g, self, batch1, batch2, beta, alpha):
-    dtype = self.type().scalarType()
-    batch_mul = matmul(g, batch1, batch2)
-    mul_a = mul(g, batch_mul, g.op("Cast", alpha, to_i=sym_help.cast_pytorch_to_onnx[dtype]))
+def baddbmm(g, self, batch1, batch2, beta, alpha):
+    dtype = self.type().scalarType()
+    batch_mul = matmul(g, batch1, batch2)
+    mul_a = mul(g, batch_mul, g.op("Cast", alpha, to_i=sym_help.cast_pytorch_to_onnx[dtype]))
     mul_b = mul(g, self, g.op("Cast", beta, to_i=sym_help.cast_pytorch_to_onnx[dtype]))
     return add(g, mul_a, mul_b)

From 493c90081008b193fb865b96671067f801c3cdcb Mon Sep 17 00:00:00 2001
From: Pavel Belevich
Date: Thu, 3 Oct 2019 12:05:33 -0700
Subject: [PATCH 019/113] Extract version to version.txt (#27149)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27149

Extract version to version.txt and add version-reading logic to setup.py and fb/torch_version.py

ghstack-source-id: 91271883

Test Plan: N/A

Reviewed By: gchanan, ezyang

Differential Revision: D17689307

fbshipit-source-id: 21899502027cec71b63d9dc151e09ff5ff3f279d
---
 setup.py    | 2 +-
 version.txt | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 version.txt

diff --git a/setup.py b/setup.py
index 0e457541ebfe3..ad94b94eb21e9 100644
--- a/setup.py
+++ b/setup.py
@@ -257,7 +257,7 @@ def report(*args):
 # Version, create_version_file, and package_name
 ################################################################################
 package_name = os.getenv('TORCH_PACKAGE_NAME', 'torch')
-version = '1.3.0a0'
+version = open('version.txt', 'r').read().strip()
 sha = 'Unknown'
 
 try:

diff --git a/version.txt b/version.txt
new file mode 100644
index 0000000000000..c4150fa5bb510
--- /dev/null
+++ b/version.txt
@@ -0,0 +1 @@
+1.3.0a0

From 7bd7a3d80639b4010bd9ef6f8ab86ed22342201b Mon Sep 17 00:00:00 2001
From: Jiakai Liu
Date: Thu, 3 Oct 2019 12:21:48 -0700
Subject: [PATCH 020/113] add AutoNonVariableTypeMode for USE_STATIC_DISPATCH
 on JIT->ATen path (#27274)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27274

This is yet another fix to address #26764. PR #26908 toggles NonVariableTypeMode in the ATen dispatcher, which is where USE_STATIC_DISPATCH takes place and is thus the most logically sound place to do such tweaks. However, we observed a nontrivial perf regression due to this fix.
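For background, NonVariableTypeMode is driven by an RAII guard over thread-local state, so every guarded scope pays a pair of thread_local accesses. A minimal sketch of the mechanism, with simplified, illustrative names rather than the actual PyTorch implementation:
```
// Illustrative only: an RAII guard that saves and restores a thread-local
// flag, in the spirit of at::AutoNonVariableTypeMode.
thread_local bool non_variable_type_mode = false;

struct NonVariableTypeModeGuard {
  bool prev;
  explicit NonVariableTypeModeGuard(bool enabled)
      : prev(non_variable_type_mode) {
    non_variable_type_mode = enabled;  // one thread_local write on entry
  }
  ~NonVariableTypeModeGuard() {
    non_variable_type_mode = prev;     // one more thread_local write on exit
  }
};
```
That cost is negligible for a single call, but not on a hot path hit millions of times per inference.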
It turns out the numel() tensor method gets called in several for-loops, incurring ~7M thread_local updates in a single forward call:
```
7173330 numel
588 size
416 q_scale
302 _empty_affine_quantized
288 contiguous
257 q_zero_point
216 qscheme
173 empty
110 set_
105 as_strided
104 permute
...
```
numel() is not called from a single place, so a natural workaround is to update function_wrapper.py so that it only adds the guard in the gen_namespace_function() case and ignores the gen_tensor_method() case. But some tensor methods are actually called from the JIT side directly (e.g. "aten::eq_" -> "(self).eq_"), so the only "band aid" left on the table is to insert the guard on the JIT->ATen path as originally done in #26868 - this is a simplified version of it, as it doesn't hurt to extend the NonVariableTypeMode scope a little bit to also cover the stack drop/pack calls.

On Android we only expose the JIT API, so we don't need to worry about tensor methods being called directly. On iOS we don't provide a wrapper yet, but we can mention this caveat in the doc. Hopefully, by the time it's widely used, we can finish the Variable/Tensor unification and remove all these hacks.

Test Plan:
- Verified it runs quantized/fp32 MobileNetV2 models;
- Verified it fixes the perf regression (revert #26908 separately);

Differential Revision: D17732489

Pulled By: ljk53

fbshipit-source-id: c14ca66aebc6b6f17ad6efac7ca47f9487c98de5
---
 tools/jit/gen_jit_dispatch.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py
index f3bd37cf1b069..563c8ec8379ee 100644
--- a/tools/jit/gen_jit_dispatch.py
+++ b/tools/jit/gen_jit_dispatch.py
@@ -160,9 +160,15 @@ def from_ivalue(arg, value):
   auto result_ = (${first}).${name}(${args_with_tensor_options});
 """)
 
+# Adding `AutoNonVariableTypeMode` guard for `USE_STATIC_DISPATCH` case is kinda
+# hack to address issue #26764. TODO: remove this hack after Variable/Tensor
+# unification (#23032) is done.
 CONSTRUCTOR = CodeTemplate("""\
 [](Stack & stack) {
     ${lvalues}
+#ifdef USE_STATIC_DISPATCH
+    at::AutoNonVariableTypeMode non_var_type_mode(true);
+#endif
     ${call}
     drop(stack, ${num_inputs});
     pack(stack, std::move(result_));

From 082f195ac9ec7e2ee9b1d25329b357cb2cedd6f3 Mon Sep 17 00:00:00 2001
From: svcscm
Date: Thu, 3 Oct 2019 12:57:29 -0700
Subject: [PATCH 021/113] Updating submodules

Summary:
GitHub commits:

https://github.com/pytorch/fbgemm/commit/8786c0819029c076b0e28320e880ba3ac192ea8b

Test Plan: n/a

Reviewed By: zpao

fbshipit-source-id: 9c04a2ba7cc2166db0203f186ece261ca8b186dd
---
 third_party/fbgemm | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/fbgemm b/third_party/fbgemm
index c8b854042b364..8786c0819029c 160000
--- a/third_party/fbgemm
+++ b/third_party/fbgemm
@@ -1 +1 @@
-Subproject commit c8b854042b364080af3c45da0db27cccbbe9b219
+Subproject commit 8786c0819029c076b0e28320e880ba3ac192ea8b

From 1c5683a80f940af24db856944d0b8356f3da101a Mon Sep 17 00:00:00 2001
From: Supriya Rao
Date: Thu, 3 Oct 2019 13:34:36 -0700
Subject: [PATCH 022/113] Avoid calling tensor.numel() in for loops (#27298)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27298

PR #26908 toggles NonVariableTypeMode in the ATen dispatcher, which is where USE_STATIC_DISPATCH takes place. This causes an issue with numel(), as it gets called through the dispatch mode and probably does not get inlined. The thread-local state is also expensive to read and write so many times, and this kills perf.
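The hunks below all apply the same mechanical rewrite; as a standalone sketch of the pattern (the identifiers and the destination type here are illustrative, not the exact code from this diff):
```
// Before: the loop bound re-enters the dispatcher on every iteration,
// and each dispatch touches the thread-local mode flag.
for (int i = 0; i < tensor.numel(); ++i) {
  out[i] = static_cast<uint8_t>(in[i] + 128);
}

// After: call numel() once and loop over a plain local.
const auto n = tensor.numel();
for (int i = 0; i < n; ++i) {
  out[i] = static_cast<uint8_t>(in[i] + 128);
}
```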
PR #27274 is another approach to fix this and has more details. Test Plan: Quantized mobilenetV2 perf before this change Main run finished. Milliseconds per iter: 28.6782. Iters per second: 34.8696 Perf after this change Main run finished. Milliseconds per iter: 22.2585. Iters per second: 44.9267 Imported from OSS Differential Revision: D17742565 fbshipit-source-id: 43c6045cc001c46916ba339555c9d809a2537eff --- aten/src/ATen/native/quantized/cpu/qconv.cpp | 3 ++- aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp | 3 ++- aten/src/ATen/native/quantized/cpu/qlinear.cpp | 3 ++- aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp | 3 ++- aten/src/ATen/quantized/Quantizer.cpp | 6 ++++-- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 208f5b10581bf..48a973bc946c2 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -360,7 +360,8 @@ class QConv2dInt8 final : public c10::OperatorKernel { kernel_zp, MemoryFormat::ChannelsLast); auto* qnnp_w_data = qnnp_weight.data_ptr(); - for (int i = 0; i < weight_contig.numel(); ++i) { + auto wt_numel = weight_contig.numel(); + for (int i = 0; i < wt_numel; ++i) { qnnp_w_data[i] = static_cast(w_data[i] + 128); } // Original bias was float, so we requantize it here. diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index d90871421c20d..70c634d65d8eb 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -218,7 +218,8 @@ class QConvPackWeightInt8 final : public c10::OperatorKernel { weight.q_scale(), weight_zp); auto* qnnp_w_data = qnnp_weight.data_ptr(); - for (int i = 0; i < weight_contig.numel(); ++i) { + auto wt_numel = weight_contig.numel(); + for (int i = 0; i < wt_numel; ++i) { qnnp_w_data[i] = static_cast(w_data[i] + 128); } // We set the pre-packed conv weights to nullptr below as we call pre-pack diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 55db78e4b3026..08ab39cfb9642 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -249,7 +249,8 @@ class QLinearInt8 final : public torch::OperatorKernel { kernel_scale, kernel_zp); auto* qnnp_w_data = qnnp_weight.data_ptr(); - for (int i = 0; i < weight_contig.numel(); ++i) { + auto wt_numel = weight_contig.numel(); + for (int i = 0; i < wt_numel; ++i) { qnnp_w_data[i] = static_cast(w_data[i] + 128); } // Original bias was float, so we requantize it here. 
diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index bcd0b690b9f63..ea6db888fd31d 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -163,7 +163,8 @@ class QLinearPackWeightInt8 final : public c10::OperatorKernel { weight.q_scale(), weight_zp); auto* qnnp_w_data = qnnp_weight.data_ptr(); - for (int i = 0; i < weight_contig.numel(); ++i) { + auto wt_numel = weight_contig.numel(); + for (int i = 0; i < wt_numel; ++i) { qnnp_w_data[i] = static_cast(inp_data[i] + 128); } initQNNPACK(); diff --git a/aten/src/ATen/quantized/Quantizer.cpp b/aten/src/ATen/quantized/Quantizer.cpp index 69956aaba0729..5a1a7a94d861d 100644 --- a/aten/src/ATen/quantized/Quantizer.cpp +++ b/aten/src/ATen/quantized/Quantizer.cpp @@ -297,7 +297,8 @@ Tensor quantize_tensor(Tensor rtensor, Tensor qtensor, double scale, int64_t zer } #endif auto qdata = qtensor.data_ptr(); - for (int i = 0; i < rtensor.numel(); ++i) { + auto numel = rtensor.numel(); + for (int i = 0; i < numel; ++i) { qdata[i] = quantize_val(scale, zero_point, rdata[i]); } return qtensor; @@ -318,7 +319,8 @@ Tensor dequantize_tensor(Tensor qtensor, Tensor rtensor, double scale, int64_t z checkZeroPoint(fn_name, zero_point); const auto* qd = qtensor.data_ptr(); float* rd = rtensor.data_ptr(); - for (auto i = 0; i < qtensor.numel(); ++i) { + auto numel = qtensor.numel(); + for (auto i = 0; i < numel; ++i) { rd[i] = dequantize_val(scale, zero_point, qd[i]); } return rtensor; From d6336b0d8b431b029c791668d696140e4bed7599 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Thu, 3 Oct 2019 13:37:17 -0700 Subject: [PATCH 023/113] Fix circle CI Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27307 Test Plan: Imported from OSS Differential Revision: D17746444 Pulled By: xta0 fbshipit-source-id: ed37f91921f1ea7db6c63ba69f04883856341c39 --- .circleci/config.yml | 2 +- .circleci/scripts/binary_ios_build.sh | 2 +- .circleci/verbatim-sources/job-specs-custom.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a10e63046c795..ddb1cd5401701 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1333,7 +1333,7 @@ jobs: export PATH="~/anaconda/bin:${PATH}" source ~/anaconda/bin/activate # Install dependencies - conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests + conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests --yes # sync submodules cd ${PROJ_ROOT} git submodule sync diff --git a/.circleci/scripts/binary_ios_build.sh b/.circleci/scripts/binary_ios_build.sh index c15813b5c5d71..900df30ec60a5 100644 --- a/.circleci/scripts/binary_ios_build.sh +++ b/.circleci/scripts/binary_ios_build.sh @@ -13,7 +13,7 @@ chmod +x ~/Downloads/conda.sh export PATH="~/anaconda/bin:${PATH}" source ~/anaconda/bin/activate # Install dependencies -conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests +conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests --yes export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} # sync submodules cd ${PROJ_ROOT} diff --git a/.circleci/verbatim-sources/job-specs-custom.yml b/.circleci/verbatim-sources/job-specs-custom.yml index 7f089a760f35e..d0cb06a8ce092 100644 --- a/.circleci/verbatim-sources/job-specs-custom.yml +++ 
b/.circleci/verbatim-sources/job-specs-custom.yml @@ -429,7 +429,7 @@ export PATH="~/anaconda/bin:${PATH}" source ~/anaconda/bin/activate # Install dependencies - conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests + conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing requests --yes # sync submodules cd ${PROJ_ROOT} git submodule sync From 75d8dab9bedc060b9b0f4ecd93c6838d85e34ef2 Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Thu, 3 Oct 2019 13:42:12 -0700 Subject: [PATCH 024/113] Update the link for iOS demo app in README.md (#27145) Summary: Update the link for iOS demo app in README.md Pull Request resolved: https://github.com/pytorch/pytorch/pull/27145 Differential Revision: D17746591 Pulled By: xta0 fbshipit-source-id: 6f49a0daddc8b79804e1b8487ba1db3807a3f481 --- ios/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ios/README.md b/ios/README.md index 1dc4f65d8777d..cd2d31afa1247 100644 --- a/ios/README.md +++ b/ios/README.md @@ -17,7 +17,7 @@ For Objective-C developers, simply import the umbrella header #import ``` -For Swift developers, you need to create an Objective-C class as a bridge to call the C++ APIs. We highly recommend you to follow the [Image Classification](https://github.com/pytorch/examples) demo where you can find out how C++, Objective-C and Swift work together. +For Swift developers, you need to create an Objective-C class as a bridge to call the C++ APIs. We highly recommend you to follow the [Image Classification](https://github.com/pytorch/ios-demo-app/tree/master/PyTorchDemo) demo where you can find out how C++, Objective-C and Swift work together. ### Disable Bitcode @@ -25,4 +25,4 @@ Since PyTorch is not yet built with bitcode support, you need to disable bitcode ## LICENSE -PyTorch is BSD-style licensed, as found in the LICENSE file. \ No newline at end of file +PyTorch is BSD-style licensed, as found in the LICENSE file. From 6e2a8d0625ef7a3ea152e6abcfa4026fcc5b3b88 Mon Sep 17 00:00:00 2001 From: Igor Fedan Date: Thu, 3 Oct 2019 14:08:26 -0700 Subject: [PATCH 025/113] Allow use cpu_serial_kernel with void-lambda (#27271) Summary: Currently we use CPU_tensor_apply1 to loop through the tensor in single thread and aggregate data: ``` // compute variance per input accscalar_t var_sum = 0; CPU_tensor_apply1(in, [&] (const scalar_t& i) { var_sum += (i - mean) * (i - mean); }); ``` and we don't have the ability to use TensorIterator for this. 
```
accscalar_t var_sum = 0;
auto iter = TensorIterator::unary_op(self, self);
cpu_serial_kernel(iter, [&](scalar_t i) -> scalar_t {
  var_sum += (i - mean) * (i - mean);
  return i; // unable to set the value back, because self should be const
});
```
This PR should resolve this problem and allow using a void lambda:
```
auto iter = at::TensorIterator();
iter.add_input(in);
iter.build();
accscalar_t var_sum = 0;
at::native::cpu_serial_kernel(iter, [&](scalar_t i) -> void {
  var_sum += (i - mean) * (i - mean);
});
```
In the future it makes sense to change the reduction path and allow reducing to a scalar, not just to a tensor.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/27271

Differential Revision: D17743310

Pulled By: ifedan

fbshipit-source-id: a149751f2d671aefd3ed84bd50b2c0543a63b701
---
 aten/src/ATen/native/cpu/IsContiguous.h     | 40 ++++++++++++++----
 aten/src/ATen/native/cpu/Loops.h            | 41 ++++++++++++++-----
 aten/src/ATen/test/tensor_iterator_test.cpp | 45 ++++++++++++++++++++-
 3 files changed, 108 insertions(+), 18 deletions(-)

diff --git a/aten/src/ATen/native/cpu/IsContiguous.h b/aten/src/ATen/native/cpu/IsContiguous.h
index 6392746a8013a..b44e501171fff 100644
--- a/aten/src/ATen/native/cpu/IsContiguous.h
+++ b/aten/src/ATen/native/cpu/IsContiguous.h
@@ -5,34 +5,58 @@ namespace at { namespace native { namespace {
 
 // n: number of function arguments (arity)
 // traits: function_traits (see FunctionTraits.h)
 // s: index of scalar argument or -1
-template <int n, typename traits, int s=-1>
+template <int n, int stride_index, typename traits, int s=-1>
 struct IsContiguous {
   static bool eval(const int64_t* strides) {
     using type = typename traits::template arg<n - 1>::type;
-    return strides[n] == (s == n ? 0 : sizeof(type)) &&
-           IsContiguous<n - 1, traits, s>::eval(strides);
+    return strides[stride_index] == (s == n ? 0 : sizeof(type)) &&
+           IsContiguous<n - 1, stride_index - 1, traits, s>::eval(strides);
   }
 };
 
+// will be called when there is an output exists
 template <typename traits, int s>
-struct IsContiguous<0, traits, s> {
+struct IsContiguous<0, 0, traits, s> {
   static bool eval(const int64_t* strides) {
     return strides[0] == sizeof(typename traits::result_type);
   }
 };
 
+// will be called when there is no output
+template <typename traits, int s>
+struct IsContiguous<0, -1, traits, s> {
+  static bool eval(const int64_t* strides) {
+    return true;
+  }
+};
+
 // output and all inputs are contiguous
-template <typename traits>
+template <typename traits,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
 static inline bool is_contiguous(const int64_t* strides) {
-  return IsContiguous<traits::arity, traits>::eval(strides);
+  return IsContiguous<traits::arity, traits::arity, traits>::eval(strides);
+}
+
+template <typename traits,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous<traits::arity, traits::arity - 1, traits>::eval(strides);
 }
 
 // input at `s` is scalar (stride 0); output and other inputs are contiguous
 // NB: output is typically at strides[0] so first input corresponds to s=1
-template <typename traits, int s>
+template <typename traits, int s,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
+  return IsContiguous<traits::arity, traits::arity, traits, s>::eval(strides);
+}
+
+template <typename traits, int s,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
 static inline bool is_contiguous_scalar(const int64_t* strides) {
   static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
-  return IsContiguous<traits::arity, traits, s>::eval(strides);
+  return IsContiguous<traits::arity, traits::arity - 1, traits, s>::eval(strides);
 }
 
 }}}

diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h
index e4bc9bc492b49..7ae0e4d0e93ba 100644
--- a/aten/src/ATen/native/cpu/Loops.h
+++ b/aten/src/ATen/native/cpu/Loops.h
@@ -80,13 +80,40 @@ dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& opt_scalar, size_t S, int64_t i) {
   return
dereference_vec_impl(data, opt_scalar, S, i, Indices{}); } +template ::result_type>::value>::type* = nullptr> +static inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) { + using traits = function_traits; + using result_type = typename traits::result_type; + for (; i < n; i++) { + result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); + *out_ptr = c10::guts::apply(op, dereference( + &data[1], + &strides[1], + i)); + } +} + +template ::result_type>::value>::type* = nullptr> +static inline void +execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) { + using traits = function_traits; + for (; i < n; i++) { + c10::guts::apply(op, dereference( + &data[0], + &strides[0], + i)); + } +} + // Basic loop operation (one output, N inputs). May be auto-vectorized // by the compiler. Supports inputs and outputs of different types. template static inline void basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t op) { using traits = function_traits; - using result_type = typename traits::result_type; constexpr int ntensors = traits::arity + 1; // Copying strides to temporary array helps auto vectorization in older GCC @@ -96,13 +123,7 @@ basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_ strides[arg] = strides_[arg]; } - for (; i < n; i++) { - result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); - *out_ptr = c10::guts::apply(op, dereference( - &data[1], - &strides[1], - i)); - } + execute_op(data, strides, i, n, op); } // Explicitly vectorized loop implementation. All inputs and outputs must be @@ -205,7 +226,8 @@ void cpu_kernel_vec(TensorIterator& iter, func_t op, vec_func_t vop) { template void cpu_serial_kernel(TensorIterator& iter, func_t op) { using traits = function_traits; - TORCH_INTERNAL_ASSERT(iter.ntensors() >= traits::arity + 1); + TORCH_INTERNAL_ASSERT((std::is_void::value && + iter.noutputs() == 0 && iter.ntensors() == traits::arity) || (iter.ntensors() >= traits::arity + 1)); iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) { if (is_contiguous(strides)) { @@ -217,6 +239,7 @@ void cpu_serial_kernel(TensorIterator& iter, func_t op) { }); } }, {0, iter.numel()}); + iter.cast_outputs(); } }}} // namespace at::native:: diff --git a/aten/src/ATen/test/tensor_iterator_test.cpp b/aten/src/ATen/test/tensor_iterator_test.cpp index ef4d17a2f8b3f..6939d32ac7686 100644 --- a/aten/src/ATen/test/tensor_iterator_test.cpp +++ b/aten/src/ATen/test/tensor_iterator_test.cpp @@ -61,17 +61,41 @@ TEST(TensorIteratorTest, SerialLoopUnary_##name) { \ ASSERT_ANY_THROW(out.equal(expected)); \ } +#define NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE(ctype,name) \ +TEST(TensorIteratorTest, SerialLoopUnaryNoOutput_##name) { \ + auto in = random_tensor_for_type(k##name); \ + auto iter = at::TensorIterator(); \ + iter.add_input(in); \ + iter.build(); \ + int64_t acc = 0; \ + at::native::cpu_serial_kernel(iter, [&](ctype a) -> void { acc++; }); \ + EXPECT_TRUE(acc == in.numel()); \ +} + #define BINARY_TEST_ITER_FOR_TYPE(ctype,name) \ TEST(TensorIteratorTest, SerialLoopBinary_##name) { \ Tensor out; \ auto in1 = random_tensor_for_type(k##name); \ auto in2 = random_tensor_for_type(k##name); \ auto expected = in1.add(in2); \ - auto iter = TensorIterator::binary_op(out, in1, in2); \ + auto iter = TensorIterator::binary_op(out, in1, in2); \ at::native::cpu_serial_kernel(iter, [=](ctype a, ctype b) -> int { return a + b; }); \ 
ASSERT_ANY_THROW(out.equal(expected)); \ } +#define NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE(ctype,name) \ +TEST(TensorIteratorTest, SerialLoopBinaryNoOutput_##name) { \ + auto in1 = random_tensor_for_type(k##name); \ + auto in2 = random_tensor_for_type(k##name); \ + auto iter = at::TensorIterator(); \ + iter.add_input(in1); \ + iter.add_input(in2); \ + iter.build(); \ + int64_t acc = 0; \ + at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b) -> void { acc++; }); \ + EXPECT_TRUE(acc == in1.numel()); \ +} + #define POINTWISE_TEST_ITER_FOR_TYPE(ctype,name) \ TEST(TensorIteratorTest, SerialLoopPointwise_##name) { \ Tensor out; \ @@ -89,6 +113,21 @@ TEST(TensorIteratorTest, SerialLoopPointwise_##name) { ASSERT_ANY_THROW(out.equal(expected)); \ } +#define NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE(ctype,name) \ +TEST(TensorIteratorTest, SerialLoopPoinwiseNoOutput_##name) { \ + auto in1 = random_tensor_for_type(k##name); \ + auto in2 = random_tensor_for_type(k##name); \ + auto in3 = random_tensor_for_type(k##name); \ + auto iter = at::TensorIterator(); \ + iter.add_input(in1); \ + iter.add_input(in2); \ + iter.add_input(in3); \ + iter.build(); \ + int64_t acc = 0; \ + at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b, ctype c) -> void { acc++; }); \ + EXPECT_TRUE(acc == in1.numel()); \ +} + // The alternative way to calculate a < b is (b - a).clamp(0).toBool() // To prevent an overflow in subtraction (b - a) for unsigned types(unit, bool) // we will convert in to int first @@ -112,6 +151,9 @@ TEST(TensorIteratorTest, ComparisonLoopBinary_##name) { AT_FORALL_SCALAR_TYPES(UNARY_TEST_ITER_FOR_TYPE) AT_FORALL_SCALAR_TYPES(BINARY_TEST_ITER_FOR_TYPE) AT_FORALL_SCALAR_TYPES(POINTWISE_TEST_ITER_FOR_TYPE) +AT_FORALL_SCALAR_TYPES(NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE) +AT_FORALL_SCALAR_TYPES(NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE) +AT_FORALL_SCALAR_TYPES(NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE) AT_FORALL_SCALAR_TYPES_AND(Bool, COMPARISON_TEST_ITER_FOR_TYPE) TEST(TensorIteratorTest, SerialLoopSingleThread) { @@ -172,3 +214,4 @@ TEST(TensorIteratorTest, DoNotComputeCommonDTypeIfOutputIsUndefined) { iter.compute_common_dtype_only_for_inputs(); ASSERT_ANY_THROW(iter.build()); } + From 2e62318243fe9e273f691bd80036b33346549455 Mon Sep 17 00:00:00 2001 From: Hong Xu Date: Thu, 3 Oct 2019 14:52:29 -0700 Subject: [PATCH 026/113] Move the CUDA implementation of log10 to ATen. 
(#26733) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26733 Close #24587 Test Plan: Imported from OSS Differential Revision: D17606981 Pulled By: VitalyFedyunin fbshipit-source-id: 732f07b981287da3ca235b272b7b6f78144f8ebe --- aten/src/ATen/Declarations.cwrap | 14 -------------- aten/src/ATen/core/TensorMethods.h | 8 +------- aten/src/ATen/native/UnaryOps.cpp | 5 ++++- aten/src/ATen/native/cuda/CUDAUnaryOps.cpp | 1 - aten/src/ATen/native/cuda/UnaryOpsKernel.cu | 9 +++++++++ aten/src/ATen/native/native_functions.yaml | 7 ++----- aten/src/TH/THGeneral.cpp | 5 ----- aten/src/TH/generic/THTensorMath.h | 1 - aten/src/TH/generic/THVector.h | 1 - aten/src/TH/generic/THVectorDefault.cpp | 1 - aten/src/THC/THCGeneral.h.in | 1 - aten/src/THC/THCNumerics.cuh | 3 --- aten/src/THC/generic/THCTensorMathPointwise.cu | 1 - aten/src/THC/generic/THCTensorMathPointwise.h | 1 - aten/src/THCUNN/THCHalfAutoNumerics.cuh | 4 ---- test/test_torch.py | 2 +- 16 files changed, 17 insertions(+), 47 deletions(-) diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index 5e9cf532ba723..4da3e87a68f85 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -762,20 +762,6 @@ output: True - THTensor* self ]] -[[ - name: _th_log10 - cname: log10 - types: - - floating_point - backends: - - CUDA - variants: function - return: argument 0 - arguments: - - arg: THTensor* result - output: True - - THTensor* self -]] [[ name: _th_log1p cname: log1p diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index e5e3de59248bc..9003a116aa42c 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1547,13 +1547,7 @@ inline Tensor Tensor::log10() const { inline Tensor & Tensor::log10_() const { #ifdef USE_STATIC_DISPATCH at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::log10_(const_cast(*this)); - break; - default: - AT_ERROR("log10_ not implemented for ", at::toString(type_set())); - } + return TypeDefault::log10_(const_cast(*this)); #else static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log10_", ""}).value(); return c10::Dispatcher::singleton().callUnboxedOnly( diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index 15e67644b3ec2..3b26cd85f26c3 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -77,6 +77,10 @@ Tensor& log_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(r Tensor log(const Tensor& self) { return unary_op_impl(self, at::log_out); } Tensor& log_(Tensor& self) { return unary_op_impl_(self, at::log_out); } +Tensor& log10_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, log10_stub); } +Tensor log10(const Tensor& self) { return unary_op_impl(self, at::log10_out); } +Tensor& log10_(Tensor& self) { return unary_op_impl_(self, at::log10_out); } + Tensor& round_out(Tensor& result, const Tensor& self) { return unary_op_impl_out(result, self, round_stub); } Tensor round(const Tensor& self) { return unary_op_impl(self, at::round_out); } Tensor& round_(Tensor& self) { return unary_op_impl_(self, at::round_out); } @@ -269,7 +273,6 @@ IMPLEMENT_UNARY_OP_VEC(erfc) IMPLEMENT_UNARY_OP_VEC_CUDA(erfinv) IMPLEMENT_UNARY_OP_VEC(exp) IMPLEMENT_UNARY_OP_VEC(frac) -IMPLEMENT_UNARY_OP_VEC(log10) IMPLEMENT_UNARY_OP_VEC(log1p) 
IMPLEMENT_UNARY_OP_VEC(log2) IMPLEMENT_UNARY_OP_VEC(reciprocal) diff --git a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp index e18bfd3e05808..61930831796d0 100644 --- a/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp +++ b/aten/src/ATen/native/cuda/CUDAUnaryOps.cpp @@ -74,7 +74,6 @@ IMPLEMENT_UNARY_OP_PREQUEL(erf) IMPLEMENT_UNARY_OP_PREQUEL(erfc) IMPLEMENT_UNARY_OP_PREQUEL(exp) IMPLEMENT_UNARY_OP_PREQUEL(frac) -IMPLEMENT_UNARY_OP_PREQUEL(log10) IMPLEMENT_UNARY_OP_PREQUEL(log1p) IMPLEMENT_UNARY_OP_PREQUEL(log2) IMPLEMENT_UNARY_OP_PREQUEL(reciprocal) diff --git a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu index 9c6695d32c3d6..54b922f69064f 100644 --- a/aten/src/ATen/native/cuda/UnaryOpsKernel.cu +++ b/aten/src/ATen/native/cuda/UnaryOpsKernel.cu @@ -65,6 +65,14 @@ void log_kernel_cuda(TensorIterator& iter) { }); } +void log10_kernel_cuda(TensorIterator& iter) { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(iter.dtype(), "log10_cuda", [&]() { + gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { + return ::log10(a); + }); + }); +} + void neg_kernel_cuda(TensorIterator& iter) { AT_DISPATCH_ALL_TYPES_AND(ScalarType::Half, iter.dtype(), "neg_cuda", [&]() { gpu_kernel(iter, []GPU_LAMBDA(scalar_t a) -> scalar_t { @@ -180,6 +188,7 @@ REGISTER_DISPATCH(ceil_stub, &ceil_kernel_cuda); REGISTER_DISPATCH(expm1_stub, &expm1_kernel_cuda); REGISTER_DISPATCH(floor_stub, &floor_kernel_cuda); REGISTER_DISPATCH(log_stub, &log_kernel_cuda); +REGISTER_DISPATCH(log10_stub, &log10_kernel_cuda); REGISTER_DISPATCH(neg_stub, &neg_kernel_cuda); REGISTER_DISPATCH(round_stub, &round_kernel_cuda); REGISTER_DISPATCH(rsqrt_stub, &rsqrt_kernel_cuda); diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 8d37c05ae69af..b4ce663e6cb58 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1539,15 +1539,12 @@ use_c10_dispatcher: unboxed_only supports_named_tensor: True variants: function, method - dispatch: - CPU: _log10__cpu - CUDA: _log10__cuda - func: log10.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
supports_named_tensor: True dispatch: - CPU: _log10_out_cpu - CUDA: _log10_out_cuda + CPU: log10_out + CUDA: log10_out - func: log1p(Tensor self) -> Tensor use_c10_dispatcher: full diff --git a/aten/src/TH/THGeneral.cpp b/aten/src/TH/THGeneral.cpp index f921673549f88..060b1e6cebe55 100644 --- a/aten/src/TH/THGeneral.cpp +++ b/aten/src/TH/THGeneral.cpp @@ -188,11 +188,6 @@ void THFree(void *ptr) c10::free_cpu(ptr); } -double THLog10(const double x) -{ - return log10(x); -} - double THLog1p(const double x) { #if (defined(_MSC_VER) || defined(__MINGW32__)) diff --git a/aten/src/TH/generic/THTensorMath.h b/aten/src/TH/generic/THTensorMath.h index 600ae40057a7b..97b88cf3cdd91 100644 --- a/aten/src/TH/generic/THTensorMath.h +++ b/aten/src/TH/generic/THTensorMath.h @@ -151,7 +151,6 @@ TH_API void THTensor_(abs)(THTensor *r_, THTensor *t); #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) TH_API void THTensor_(sigmoid)(THTensor *r_, THTensor *t); -TH_API void THTensor_(log10)(THTensor *r_, THTensor *t); TH_API void THTensor_(log1p)(THTensor *r_, THTensor *t); TH_API void THTensor_(log2)(THTensor *r_, THTensor *t); TH_API void THTensor_(exp)(THTensor *r_, THTensor *t); diff --git a/aten/src/TH/generic/THVector.h b/aten/src/TH/generic/THVector.h index c29d4503de167..d6fffd74d6bdc 100644 --- a/aten/src/TH/generic/THVector.h +++ b/aten/src/TH/generic/THVector.h @@ -31,7 +31,6 @@ TH_API void THVector_(abs)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); /* floating point only now */ #if defined(TH_REAL_IS_FLOAT) || defined(TH_REAL_IS_DOUBLE) -TH_API void THVector_(log10)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); TH_API void THVector_(log1p)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); TH_API void THVector_(log2)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); TH_API void THVector_(sigmoid)(scalar_t *y, const scalar_t *x, const ptrdiff_t n); diff --git a/aten/src/TH/generic/THVectorDefault.cpp b/aten/src/TH/generic/THVectorDefault.cpp index 6814cdc362c6b..e847cda996af4 100644 --- a/aten/src/TH/generic/THVectorDefault.cpp +++ b/aten/src/TH/generic/THVectorDefault.cpp @@ -243,7 +243,6 @@ VECTOR_IMPLEMENT_FUNCTION(abs,) #define TH_MATH_NAME(fn) fn #endif -VECTOR_IMPLEMENT_FUNCTION(log10,TH_MATH_NAME(log10)) VECTOR_IMPLEMENT_FUNCTION(log1p,TH_MATH_NAME(log1p)) VECTOR_IMPLEMENT_FUNCTION(log2,TH_MATH_NAME(log2)) VECTOR_IMPLEMENT_FUNCTION(sigmoid_DEFAULT,TH_MATH_NAME(TH_sigmoid)) diff --git a/aten/src/THC/THCGeneral.h.in b/aten/src/THC/THCGeneral.h.in index 6a086db395086..fac4e04b3f32d 100644 --- a/aten/src/THC/THCGeneral.h.in +++ b/aten/src/THC/THCGeneral.h.in @@ -3,7 +3,6 @@ #include #include -#undef log10 #undef log1p #undef log2 diff --git a/aten/src/THC/THCNumerics.cuh b/aten/src/THC/THCNumerics.cuh index 7e6e932dfa503..b80335958ca4b 100644 --- a/aten/src/THC/THCNumerics.cuh +++ b/aten/src/THC/THCNumerics.cuh @@ -204,7 +204,6 @@ struct THCNumerics { static inline __host__ __device__ at::Half exp(at::Half a) { return std::exp(a); } static inline __host__ __device__ at::Half exp10(at::Half a) { return ::exp10(a); } - static inline __host__ __device__ at::Half log10(at::Half a) { return ::log10(a); } static inline __host__ __device__ at::Half log1p(at::Half a) { return ::log1p(a); } static inline __host__ __device__ at::Half log2(at::Half a) { return ::log2(a); } static inline __host__ __device__ at::Half cos(at::Half a) { return ::cos(a); } @@ -279,7 +278,6 @@ struct THCNumerics { static inline __host__ __device__ float exp (float a) { return expf(a); } static inline 
__host__ __device__ float exp10(float a) { return exp10f(a); } - static inline __host__ __device__ float log10(float a) { return log10f(a); } static inline __host__ __device__ float log1p(float a) { return log1pf(a); } static inline __host__ __device__ float log2 (float a) { return log2f(a); } static inline __host__ __device__ float cos (float a) { return cosf(a); } @@ -329,7 +327,6 @@ struct THCNumerics { static inline __host__ __device__ double exp (double a) { return ::exp(a); } static inline __host__ __device__ double exp10(double a) { return ::exp10(a); } - static inline __host__ __device__ double log10(double a) { return ::log10(a); } static inline __host__ __device__ double log1p(double a) { return ::log1p(a); } static inline __host__ __device__ double log2 (double a) { return ::log2(a); } static inline __host__ __device__ double cos (double a) { return ::cos(a); } diff --git a/aten/src/THC/generic/THCTensorMathPointwise.cu b/aten/src/THC/generic/THCTensorMathPointwise.cu index b3e6a16c1c3f4..8f84d4ee1bf39 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.cu +++ b/aten/src/THC/generic/THCTensorMathPointwise.cu @@ -198,7 +198,6 @@ static void propagate_names_if_named_tensor_enabled(THCTensor* result, THCTensor #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) -IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log10, THCNumerics::log10, Real) IMPLEMENT_CUDA_TENSOR_BASIC_FUNC(log1p, THCNumerics::log1p, Real) IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( log2, THCNumerics::log2, Real) IMPLEMENT_CUDA_TENSOR_BASIC_FUNC( exp, THCNumerics::exp, Real) diff --git a/aten/src/THC/generic/THCTensorMathPointwise.h b/aten/src/THC/generic/THCTensorMathPointwise.h index 94e0479c1aa02..3f688ef18b794 100644 --- a/aten/src/THC/generic/THCTensorMathPointwise.h +++ b/aten/src/THC/generic/THCTensorMathPointwise.h @@ -16,7 +16,6 @@ THC_API void THCTensor_(cminValue)(THCState *state, THCTensor *self, THCTensor * #if defined(THC_REAL_IS_FLOAT) || defined(THC_REAL_IS_DOUBLE) || defined(THC_REAL_IS_HALF) THC_API void THCTensor_(sigmoid)(THCState *state, THCTensor *self, THCTensor *src); -THC_API void THCTensor_(log10)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(log1p)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(log2)(THCState *state, THCTensor *self, THCTensor *src); THC_API void THCTensor_(exp)(THCState *state, THCTensor *self, THCTensor *src); diff --git a/aten/src/THCUNN/THCHalfAutoNumerics.cuh b/aten/src/THCUNN/THCHalfAutoNumerics.cuh index 7b55f18b77c52..9d971315e49dc 100644 --- a/aten/src/THCUNN/THCHalfAutoNumerics.cuh +++ b/aten/src/THCUNN/THCHalfAutoNumerics.cuh @@ -39,10 +39,6 @@ inline __host__ __device__ THHalf exp(THHalf a) { return THCNumerics::exp(a); } -inline __host__ __device__ THHalf log10(THHalf a) { - return THCNumerics::log10(a); -} - inline __host__ __device__ THHalf log1p(THHalf a) { return THCNumerics::log1p(a); } diff --git a/test/test_torch.py b/test/test_torch.py index 5fa75cd631882..b5cf74e149967 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -11603,7 +11603,7 @@ def test_unary_out_op_mem_overlap(self, device): ("log", positives, True, True, 'cpu'), ("log", positives, True, True, 'cuda'), ("log10", positives, True, True, 'cpu'), - ("log10", positives, False, True, 'cuda'), + ("log10", positives, True, True, 'cuda'), ("log1p", positives, True, True, 'cpu'), ("log1p", positives, False, True, 'cuda'), ("log2", positives, True, True, 'cpu'), From 0c4bc2753951fb0f98da98355a1cc78bb1e5361a Mon Sep 
17 00:00:00 2001 From: Nathan Goldbaum Date: Thu, 3 Oct 2019 15:18:47 -0700 Subject: [PATCH 027/113] Mention magma-cuda101 package in install instructions (#27325) Summary: There is a magma package for the newest CUDA verson (10.1), mention it here lest someone try to mistakenly use the version for CUDA 10.0. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27325 Differential Revision: D17749535 Pulled By: soumith fbshipit-source-id: 2d34a7af1218e6157935bfd5e03f4d2c0f00f200 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d99d8c73777c..60366994dd880 100644 --- a/README.md +++ b/README.md @@ -175,7 +175,7 @@ conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing On Linux ```bash # Add LAPACK support for the GPU if needed -conda install -c pytorch magma-cuda90 # or [magma-cuda92 | magma-cuda100 ] depending on your cuda version +conda install -c pytorch magma-cuda90 # or [magma-cuda92 | magma-cuda100 | magma-cuda101 ] depending on your cuda version ``` #### Get the PyTorch Source From 05df6b67c639009303fa44327dda49a38719bc28 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Thu, 3 Oct 2019 15:32:43 -0700 Subject: [PATCH 028/113] C++ API parity: TensorTest.BackwardNonScalarOutputs Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27314 Test Plan: Imported from OSS Differential Revision: D17746371 Pulled By: pbelevich fbshipit-source-id: 246fae22a60ed9a6d7b9843239b4b3391cc9dc3e --- test/cpp/api/tensor.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/cpp/api/tensor.cpp b/test/cpp/api/tensor.cpp index 0a43bd855e73d..4a262cddccc0e 100644 --- a/test/cpp/api/tensor.cpp +++ b/test/cpp/api/tensor.cpp @@ -474,6 +474,13 @@ TEST(TensorTest, BackwardCreatesOnesGrad) { torch::ones_like(x))); } +TEST(TensorTest, BackwardNonScalarOutputs) { + auto x = torch::randn({5, 5}, torch::requires_grad()); + auto y = x * x; + ASSERT_THROWS_WITH(y.backward(), + "grad can be implicitly created only for scalar outputs"); +} + TEST(TensorTest, IsLeaf) { auto x = torch::tensor({5}, at::TensorOptions().requires_grad(true)); auto y = x * x; From a444054d4b3d0f027179bd17ab7ae26e378feca9 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Thu, 3 Oct 2019 15:41:24 -0700 Subject: [PATCH 029/113] Fix build (#27318) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27318 Fix TBB build USE_TBB=1 ATEN_THREADING=TBB python setup.py develop install --cmake Test Plan: Imported from OSS Differential Revision: D17747449 Pulled By: ilia-cher fbshipit-source-id: 421f362bd10f3be34bffe86ae4f26e8f1c15f1a4 --- torch/lib/c10d/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt index 2557d233cd849..49d55105573d3 100644 --- a/torch/lib/c10d/CMakeLists.txt +++ b/torch/lib/c10d/CMakeLists.txt @@ -16,6 +16,10 @@ else() message(STATUS "Building c10d without CUDA/ROCm support") endif() +if(USE_TBB) +include_directories(${TBB_ROOT_DIR}/include) +endif() + if(USE_GLOO) option(USE_C10D_GLOO "USE C10D GLOO" ON) endif() From 74572fc985ace7f024a885218641dbfe0b0dd021 Mon Sep 17 00:00:00 2001 From: Ilia Cherniavskii Date: Thu, 3 Oct 2019 15:46:34 -0700 Subject: [PATCH 030/113] Relax restrictions on set_num_threads (#27190) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27190 Allow set_num_threads to be called multiple times in case of TBB parallel backend Test Plan: BUILD_BINARY=1 USE_TBB=1 
ATEN_THREADING=TBB python setup.py develop install --cmake
./build/bin/test_parallel
./build/bin/thread_init_test

Reviewed By: kostmo

Differential Revision: D17704236

Pulled By: ilia-cher

fbshipit-source-id: 274380795e78ba417301c5faa18c9e9d3198bd5e
---
 aten/src/ATen/ParallelNative.cpp     |  5 +++--
 aten/src/ATen/ParallelNativeTBB.cpp  | 13 ++++---------
 aten/src/ATen/test/test_parallel.cpp |  9 +++++++++
 torch/_torch_docs.py                 | 10 +++++-----
 4 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp
index f7e51c83fd829..9dff0ee7f1d39 100644
--- a/aten/src/ATen/ParallelNative.cpp
+++ b/aten/src/ATen/ParallelNative.cpp
@@ -179,8 +179,9 @@ void set_num_threads(int nthreads) {
   TORCH_CHECK(nthreads > 0, "Expected positive number of threads");
   int no_value = NOT_SET;
   TORCH_CHECK(num_intraop_threads.compare_exchange_strong(no_value, nthreads),
-      "Error: cannot set number of interop threads "
-      "after parallel work has started or after set_num_threads call");
+      "Error: cannot set number of intraop threads "
+      "after parallel work has started or after set_num_threads call "
+      "when using native parallel backend");
 #else
   TORCH_CHECK(false, "set_num_threads is not supported for mobile.");
 #endif // C10_MOBILE

diff --git a/aten/src/ATen/ParallelNativeTBB.cpp b/aten/src/ATen/ParallelNativeTBB.cpp
index aa002120240f6..2fd6c0b363e30 100644
--- a/aten/src/ATen/ParallelNativeTBB.cpp
+++ b/aten/src/ATen/ParallelNativeTBB.cpp
@@ -21,11 +21,11 @@ namespace at {
 namespace {
 static thread_local tbb::task_scheduler_init tbb_init_(intraop_default_num_threads());
-std::atomic<int> num_intraop_threads_{-1};
 static thread_local tbb::task_group tg_;
 
 std::mutex global_thread_mutex_;
 std::shared_ptr<tbb::global_control> global_thread_limit_ = nullptr;
+std::atomic<int> num_intraop_threads_{-1};
 
 void _internal_set_num_threads(int nthreads) {
   TORCH_INTERNAL_ASSERT(nthreads > 0);
@@ -33,6 +33,7 @@ void _internal_set_num_threads(int nthreads) {
     std::unique_lock<std::mutex> lk(global_thread_mutex_);
     global_thread_limit_ = std::make_shared<tbb::global_control>(
         tbb::global_control::max_allowed_parallelism, nthreads);
+    num_intraop_threads_.store(nthreads);
   }
   if (tbb_init_.is_active()) {
     tbb_init_.terminate();
@@ -59,14 +60,8 @@ void init_num_threads() {
 
 void set_num_threads(int nthreads) {
   TORCH_CHECK(nthreads > 0);
-  int no_value = -1;
-  if (num_intraop_threads_.compare_exchange_strong(no_value, nthreads)) {
-    _internal_set_num_threads(nthreads);
-    return;
-  }
-  TORCH_CHECK(false,
-    "Error: cannot set number of interop threads "
-    "after parallel work has started or after set_num_threads call");
+
+  _internal_set_num_threads(nthreads);
 }
 
 int get_num_threads() {

diff --git a/aten/src/ATen/test/test_parallel.cpp b/aten/src/ATen/test/test_parallel.cpp
index 7c1072c672252..a6579ae5f0836 100644
--- a/aten/src/ATen/test/test_parallel.cpp
+++ b/aten/src/ATen/test/test_parallel.cpp
@@ -69,3 +69,12 @@ TEST(TestParallel, IntraOpLaunchFuture) {
 
   ASSERT_TRUE(v1 == 1 && v2 == 2);
 }
+
+TEST(TestParallel, MultipleSetNumThreadsCalls) {
+#if !AT_PARALLEL_NATIVE
+  set_num_threads(5);
+  ASSERT_TRUE(get_num_threads() == 5);
+  set_num_threads(10);
+  ASSERT_TRUE(get_num_threads() == 10);
+#endif
+}

diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py
index c1e99bd22d6f1..edf90bf377054 100644
--- a/torch/_torch_docs.py
+++ b/torch/_torch_docs.py
@@ -899,12 +899,12 @@ def merge_dicts(*dicts):
 
 Computes the p-norm distance between each pair of the two collections of row vectors.
-If x1 has shape :math:`P \times M` and x2 has shape :math:`R \times M` then the +If x1 has shape :math:`P \times M` and x2 has shape :math:`R \times M` then the output will have shape :math:`P \times R`. -This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` -if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to -`scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest +This function is equivalent to `scipy.spatial.distance.cdist(input,'minkowski', p=p)` +if :math:`p \in (0, \infty)`. When :math:`p = 0` it is equivalent to +`scipy.spatial.distance.cdist(input, 'hamming') * M`. When :math:`p = \infty`, the closest scipy function is `scipy.spatial.distance.cdist(xn, lambda x, y: np.abs(x - y).max())`. Args: @@ -4569,7 +4569,7 @@ def merge_dicts(*dicts): r""" set_num_threads(int) -Sets the number of threads used for parallelizing CPU operations. +Sets the number of threads used for intraop parallelism on CPU. WARNING: To ensure that the correct number of threads is used, set_num_threads must be called before running eager, JIT or autograd code. From 99c32d97fa9a97690ebf4f824ed3f68da5c8ac16 Mon Sep 17 00:00:00 2001 From: Summer Deng Date: Thu, 3 Oct 2019 16:10:48 -0700 Subject: [PATCH 031/113] Migrate the cpu and gpu implementations of resize nearest 3D from vision to caffe2 Summary: As title. Fix the build failures in unicorn-build-restrictions as discussed in D17330625 Test Plan: buck test mode/opt caffe2/caffe2/quantization/server:resize_nearest_3d_dnnlowp_op_test In vision libs, no need to explicitly add dep to resize 3d op as the caffe2_cpu dep is added by default. Reviewed By: stephenyan1231 Differential Revision: D17676082 fbshipit-source-id: c034ab67a9078f72077b396991ffb9e54e6ab40b --- caffe2/operators/resize_3d_op.cc | 224 ++++++++++++++++++ caffe2/operators/resize_3d_op.cu | 219 +++++++++++++++++ caffe2/operators/resize_3d_op.h | 92 +++++++ .../server/resize_nearest_3d_dnnlowp_op.cc | 64 +++++ .../server/resize_nearest_3d_dnnlowp_op.h | 41 ++++ .../resize_nearest_3d_dnnlowp_op_test.py | 62 +++++ 6 files changed, 702 insertions(+) create mode 100644 caffe2/operators/resize_3d_op.cc create mode 100644 caffe2/operators/resize_3d_op.cu create mode 100644 caffe2/operators/resize_3d_op.h create mode 100644 caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.cc create mode 100644 caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.h create mode 100644 caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py diff --git a/caffe2/operators/resize_3d_op.cc b/caffe2/operators/resize_3d_op.cc new file mode 100644 index 0000000000000..4898711108a35 --- /dev/null +++ b/caffe2/operators/resize_3d_op.cc @@ -0,0 +1,224 @@ +#include "caffe2/operators/resize_3d_op.h" + +#include "caffe2/utils/math.h" + +#ifdef CAFFE2_USE_MKLDNN +#include "caffe2/ideep/operators/operator_fallback_ideep.h" +#include "caffe2/ideep/utils/ideep_operator.h" +#endif + +namespace caffe2 { + +void resizeNearest3DNCHW2x( + int batch_size, + int num_channels, + int temporal_scale, + int input_frames, + int input_height, + int input_width, + const float* input, + float* output) { + const int output_frames = input_frames * temporal_scale; + const int output_height = input_height * 2; + const int output_width = input_width * 2; + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < num_channels; ++c) { + for (int f = 0; f < output_frames; ++f ) { + const int in_f = f / temporal_scale; + for (int y = 0; y < output_height; ++y) { + 
const int in_y = y / 2; + + for (int x = 0; x < input_width; ++x) { + const float v = + input[((in_f * input_height) + in_y) * input_width + x]; + const int oidx = y * output_width + x * 2; + output[oidx + 0] = v; + output[oidx + 1] = v; + } + } + output += output_height * output_width; + } + input += input_frames * input_height * input_width; + } + } +} + +template <> +bool ResizeNearest3DOp::RunOnDeviceWithOrderNCHW() { + const auto& X = Input(0); + const auto XDims = X.sizes(); + CAFFE_ENFORCE_EQ(5, XDims.size()); + + const int batch_size = X.dim32(0), num_channels = X.dim32(1), + input_frames = X.dim32(2), input_height = X.dim32(3), + input_width = X.dim32(4); + + CAFFE_ENFORCE_EQ(InputSize(), 1); + + int output_frames = input_frames * temporal_scale_; + int output_height = input_height * height_scale_; + int output_width = input_width * width_scale_; + auto* Y = Output( + 0, + {batch_size, num_channels, output_frames, output_height, output_width}, + at::dtype()); + + const float* Xdata = X.data(); + float* Ydata = Y->template mutable_data(); + + // Specialized implementation for fast 2x upsampling + if (width_scale_ == 2.0 && height_scale_ == 2.0) { + CAFFE_ENFORCE(temporal_scale_ == 1 || temporal_scale_ == 2, + "temporal_scale must be either 1 or 2"); + + resizeNearest3DNCHW2x( + batch_size, num_channels, temporal_scale_, input_frames, input_height, + input_width, Xdata, Ydata); + return true; + } + + CAFFE_THROW("Not implemented when height- and width scale are not 2"); +} + +template <> +bool ResizeNearest3DOp::RunOnDevice() { + switch (order_) { + case StorageOrder::NHWC: + CAFFE_THROW("Not implemented for storage order: ", order_); + case StorageOrder::NCHW: + return RunOnDeviceWithOrderNCHW(); + default: + CAFFE_THROW("Unknown Storage order: ", order_); + } +} + +template <> +bool ResizeNearest3DGradientOp::RunOnDeviceWithOrderNCHW() { + const auto& dY = Input(0); + const auto& X = Input(1); + + const auto inputDims = dY.sizes(); + CAFFE_ENFORCE_EQ(5, inputDims.size()); + const int batch_size = dY.dim32(0), num_channels = dY.dim32(1), + input_frames = dY.dim32(2), input_height = dY.dim32(3), + input_width = dY.dim32(4); + + const int output_frames = X.dim32(2); + const int output_height = X.dim32(3); + const int output_width = X.dim32(4); + + CAFFE_ENFORCE_EQ(InputSize(), 2); + + auto* dX = Output( + 0, + {batch_size, num_channels, output_frames, output_height, output_width}, + at::dtype()); + math::Set( + dX->numel(), 0.0f, dX->template mutable_data(), &context_); + + const float* dYdata = dY.data(); + float* dXdata = dX->template mutable_data(); + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < num_channels; ++c) { + for (int f = 0; f < input_frames; ++f) { + const int out_f = + std::min((int)(f / temporal_scale_), output_frames - 1); + for (int y = 0; y < input_height; ++y) { + const int out_y = + std::min((int)(y / height_scale_), (output_height - 1)); + for (int x = 0; x < input_width; ++x) { + const int out_x = + std::min((int)(x / width_scale_), (output_width - 1)); + dXdata[(out_f * output_height + out_y) * output_width + out_x] += + dYdata[(f * input_height + y) * input_width + x]; + } + } + } + dYdata += input_frames * input_height * input_width; + dXdata += output_frames * output_height * output_width; + } + } + + return true; +} + +template <> +bool ResizeNearest3DGradientOp::RunOnDevice() { + switch (order_) { + case StorageOrder::NHWC: + CAFFE_THROW("Not implemented for storage order: ", order_); + case StorageOrder::NCHW: + return 
RunOnDeviceWithOrderNCHW();
+    default:
+      CAFFE_THROW("Unknown Storage order: ", order_);
+  }
+}
+REGISTER_CPU_OPERATOR(ResizeNearest3D, ResizeNearest3DOp<float, CPUContext>);
+REGISTER_CPU_GRADIENT_OPERATOR(
+    ResizeNearest3DGradient,
+    ResizeNearest3DGradientOp<float, CPUContext>);
+
+#ifdef CAFFE2_USE_MKLDNN
+REGISTER_IDEEP_OPERATOR(
+    ResizeNearest3D,
+    IDEEPFallbackOp<ResizeNearest3DOp<float, CPUContext>>);
+#endif
+
+// Input: X, output: Y
+OPERATOR_SCHEMA(ResizeNearest3D)
+    .NumInputs(1)
+    .NumOutputs(1)
+    .Arg("temporal_scale", "Scale along temporal dimension")
+    .Arg("width_scale", "Scale along width dimension")
+    .Arg("height_scale", "Scale along height dimension")
+    .SetDoc(R"DOC(
+Resizes the spatial dimensions of the input tensor using nearest neighbor
+interpolation. The `width_scale` and `height_scale` arguments
+control the size of the output, which is given by:
+output_width = floor(input_width * width_scale)
+output_height = floor(input_height * height_scale)
+Assumptions:
+ - Only resize height and width
+ - Both width_scale and height_scale are 2
+)DOC")
+    .Input(0, "X", "Input tensor")
+    .Output(0, "Y", "Output tensor");
+
+// Input: dY, output: dX
+GRADIENT_OPERATOR_SCHEMA(ResizeNearest3DGradient)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .Arg("temporal_scale", "Scale along temporal dimension")
+    .Arg("width_scale", "Scale along width dimension")
+    .Arg("height_scale", "Scale along height dimension");
+
+class GetResizeNearest3DGradient : public GradientMakerBase {
+  using GradientMakerBase::GradientMakerBase;
+  vector<OperatorDef> GetGradientDefs() override {
+    return SingleGradientDef(
+        "ResizeNearest3DGradient",
+        "",
+        vector<string>{GO(0), I(0)},
+        vector<string>{GI(0)});
+  }
+};
+REGISTER_GRADIENT(ResizeNearest3D, GetResizeNearest3DGradient);
+
+} // namespace caffe2
+
+using ResizeNearest3DOpFloatCPU =
+    caffe2::ResizeNearest3DOp<float, caffe2::CPUContext>;
+
+// clang-format off
+C10_EXPORT_CAFFE2_OP_TO_C10_CPU(
+    ResizeNearest3D,
+    "_caffe2::ResizeNearest3D("
+    "Tensor X, "
+    "str order, "
+    "float temporal_scale, "
+    "float width_scale, "
+    "float height_scale"
+    ") -> (Tensor Y)",
+    ResizeNearest3DOpFloatCPU);
+// clang-format on
diff --git a/caffe2/operators/resize_3d_op.cu b/caffe2/operators/resize_3d_op.cu
new file mode 100644
index 0000000000000..378e3bec74a7f
--- /dev/null
+++ b/caffe2/operators/resize_3d_op.cu
@@ -0,0 +1,219 @@
+#include "caffe2/core/context_gpu.h"
+#include "caffe2/operators/resize_3d_op.h"
+#include "caffe2/utils/math.h"
+
+namespace caffe2 {
+
+namespace {
+
+__global__ void NearestNeighbor3DKernel(
+    const int size,
+    const int num_channels,
+    const int input_frames,
+    const int input_height,
+    const int input_width,
+    const int output_frames,
+    const int output_height,
+    const int output_width,
+    const float temporal_scale,
+    const float height_scale,
+    const float width_scale,
+    const float* X,
+    float* Y) {
+  CUDA_1D_KERNEL_LOOP(index, size) {
+    int indexTemp = index;
+    const int w = indexTemp % output_width;
+    indexTemp /= output_width;
+    const int h = indexTemp % output_height;
+    indexTemp /= output_height;
+    const int f = indexTemp % output_frames;
+    indexTemp /= output_frames;
+    const int c = indexTemp % num_channels;
+    indexTemp /= num_channels;
+    const int n = indexTemp;
+
+    const int in_f = fminf(f / temporal_scale, input_frames - 1);
+    const int in_y = fminf(h / height_scale, input_height - 1);
+    const int in_x = fminf(w / width_scale, input_width - 1);
+    Y[index] =
+        X[(((n * num_channels + c) * input_frames + in_f) * input_height + in_y)
+          * input_width + in_x];
+  }
+}
+
+__global__ void NearestNeighbor3DGradientKernel(
+    const int size,
+    const int num_channels,
+    const int input_frames,
+    const int input_height,
+    const int input_width,
+    const int output_frames,
+    const int output_height,
+    const int output_width,
+    const float temporal_scale,
+    const float height_scale,
+    const float width_scale,
+    const float* dY,
+    float* dX) {
+  CUDA_1D_KERNEL_LOOP(index, size) {
+    int indexTemp = index;
+    const int x = indexTemp % input_width;
+    indexTemp /= input_width;
+    const int y = indexTemp % input_height;
+    indexTemp /= input_height;
+    const int f = indexTemp % input_frames;
+    indexTemp /= input_frames;
+    const int c = indexTemp % num_channels;
+    indexTemp /= num_channels;
+    const int n = indexTemp;
+
+    const int out_f = fminf(f / temporal_scale, output_frames - 1);
+    const int out_y = fminf(y / height_scale, output_height - 1);
+    const int out_x = fminf(x / width_scale, output_width - 1);
+    const int out_index =
+        (((n * num_channels + c) * output_frames + out_f) * output_height +
+         out_y) * output_width + out_x;
+#if __CUDA_ARCH__ >= 350
+    atomicAdd(dX + out_index, __ldg(dY + index));
+#else
+    atomicAdd(dX + out_index, *(dY + index));
+#endif
+  }
+}
+
+} // namespace
+
+
+template <>
+bool ResizeNearest3DOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
+  const auto& X = Input(0);
+
+  const auto inputDims = X.sizes();
+  CAFFE_ENFORCE_EQ(5, inputDims.size());
+  const int batch_size = X.dim32(0), num_channels = X.dim32(1),
+            input_frames = X.dim32(2), input_height = X.dim32(3),
+            input_width = X.dim32(4);
+
+  CAFFE_ENFORCE_EQ(InputSize(), 1);
+
+  int output_frames = input_frames * temporal_scale_;
+  int output_height = input_height * height_scale_;
+  int output_width = input_width * width_scale_;
+  auto* Y = Output(
+      0,
+      {batch_size, num_channels, output_frames, output_height, output_width},
+      at::dtype<float>());
+
+  const auto size = Y->numel();
+  NearestNeighbor3DKernel<<<
+      CAFFE_GET_BLOCKS(size),
+      CAFFE_CUDA_NUM_THREADS,
+      0,
+      context_.cuda_stream()>>>(
+      size,
+      num_channels,
+      input_frames,
+      input_height,
+      input_width,
+      output_frames,
+      output_height,
+      output_width,
+      temporal_scale_,
+      height_scale_,
+      width_scale_,
+      X.data<float>(),
+      Y->template mutable_data<float>());
+
+  return true;
+}
+
+template <>
+bool ResizeNearest3DOp<float, CUDAContext>::RunOnDevice() {
+  switch (order_) {
+    case StorageOrder::NHWC:
+      CAFFE_THROW("Not implemented for storage order: ", order_);
+    case StorageOrder::NCHW:
+      return RunOnDeviceWithOrderNCHW();
+    default:
+      CAFFE_THROW("Unknown Storage order: ", order_);
+  }
+}
+
+
+template <>
+bool ResizeNearest3DGradientOp<float, CUDAContext>::RunOnDeviceWithOrderNCHW() {
+  const auto& dY = Input(0);
+  const auto& X = Input(1);
+
+  const auto inputDims = dY.sizes();
+  CAFFE_ENFORCE_EQ(5, inputDims.size());
+  const int batch_size = dY.dim32(0), num_channels = dY.dim32(1),
+            input_frames = dY.dim32(2), input_height = dY.dim32(3),
+            input_width = dY.dim32(4);
+
+  // X.dim32(2) can be different from int(input_frames / temporal_scale_)
+  // We choose to compute output_frames=int(input_frames / temporal_scale_)
+
+  // const int output_frames = X.dim32(2);
+  // const int output_height = X.dim32(3);
+  // const int output_width = X.dim32(4);
+
+  const int output_frames = int(input_frames / temporal_scale_);
+  const int output_height = int(input_height / height_scale_);
+  const int output_width = int(input_width / width_scale_);
+
+  CAFFE_ENFORCE_EQ(InputSize(), 2);
+
+  auto* dX = Output(
+      0,
+      {batch_size, num_channels, output_frames, output_height, output_width},
+      at::dtype<float>());
+  math::Set<float, CUDAContext>(
+      dX->numel(), 0.0f, dX->template mutable_data<float>(), &context_);
+
+  const auto size
= dY.numel(); + NearestNeighbor3DGradientKernel<<< + CAFFE_GET_BLOCKS(size), + CAFFE_CUDA_NUM_THREADS, + 0, + context_.cuda_stream()>>>( + size, + num_channels, + input_frames, + input_height, + input_width, + output_frames, + output_height, + output_width, + temporal_scale_, + height_scale_, + width_scale_, + dY.data(), + dX->template mutable_data()); + + return true; +} + +template <> +bool ResizeNearest3DGradientOp::RunOnDevice() { + switch (order_) { + case StorageOrder::NHWC: + CAFFE_THROW("Not implemented for storage order: ", order_); + case StorageOrder::NCHW: + return RunOnDeviceWithOrderNCHW(); + default: + CAFFE_THROW("Unknown Storage order: ", order_); + } +} + +REGISTER_CUDA_OPERATOR(ResizeNearest3D, ResizeNearest3DOp); +REGISTER_CUDA_OPERATOR( + ResizeNearest3DGradient, + ResizeNearest3DGradientOp); + +} // namespace caffe2 + +using ResizeNearest3DOpFloatCUDA = + caffe2::ResizeNearest3DOp; + +C10_EXPORT_CAFFE2_OP_TO_C10_CUDA(ResizeNearest3D, ResizeNearest3DOpFloatCUDA); diff --git a/caffe2/operators/resize_3d_op.h b/caffe2/operators/resize_3d_op.h new file mode 100644 index 0000000000000..1495940ef6637 --- /dev/null +++ b/caffe2/operators/resize_3d_op.h @@ -0,0 +1,92 @@ +#pragma once + +#include "caffe2/core/export_caffe2_op_to_c10.h" +#include "caffe2/core/context.h" +#include "caffe2/core/operator.h" + +C10_DECLARE_EXPORT_CAFFE2_OP_TO_C10(ResizeNearest3D); + + +namespace caffe2 { + +template +class ResizeNearest3DOp final : public Operator { +public: + template + explicit ResizeNearest3DOp(Args&&... args) + : Operator(std::forward(args)...), + temporal_scale_(1), + height_scale_(1), + width_scale_(1), + order_(StringToStorageOrder( + this->template GetSingleArgument("order", "NCHW"))) { + if (HasArgument("temporal_scale")) { + temporal_scale_ = static_cast( + this->template GetSingleArgument("temporal_scale", 1)); + } + if (HasArgument("height_scale")) { + height_scale_ = static_cast( + this->template GetSingleArgument("height_scale", 1)); + } + if (HasArgument("width_scale")) { + width_scale_ = static_cast( + this->template GetSingleArgument("width_scale", 1)); + } + + CAFFE_ENFORCE_GT(temporal_scale_, 0); + CAFFE_ENFORCE_GT(height_scale_, 0); + CAFFE_ENFORCE_GT(width_scale_, 0); + + CAFFE_ENFORCE(order_ == StorageOrder::NCHW || order_ == StorageOrder::NHWC); + } + + USE_OPERATOR_CONTEXT_FUNCTIONS; + + bool RunOnDevice() override; + bool RunOnDeviceWithOrderNCHW(); + + protected: + T temporal_scale_; + T height_scale_; + T width_scale_; + StorageOrder order_; +}; + +template +class ResizeNearest3DGradientOp final : public Operator { + public: + template + explicit ResizeNearest3DGradientOp(Args&&... 
args) + : Operator(std::forward(args)...), + temporal_scale_(1), + height_scale_(1), + width_scale_(1), + order_(StringToStorageOrder( + this->template GetSingleArgument("order", "NCHW"))) { + temporal_scale_ = static_cast( + this->template GetSingleArgument("temporal_scale", 1)); + height_scale_ = static_cast( + this->template GetSingleArgument("height_scale", 1)); + width_scale_ = static_cast( + this->template GetSingleArgument("width_scale", 1)); + + CAFFE_ENFORCE_GT(temporal_scale_, 0); + CAFFE_ENFORCE_GT(height_scale_, 0); + CAFFE_ENFORCE_GT(width_scale_, 0); + + CAFFE_ENFORCE(order_ == StorageOrder::NCHW || order_ == StorageOrder::NHWC); + } + + USE_OPERATOR_CONTEXT_FUNCTIONS; + + bool RunOnDevice() override; + bool RunOnDeviceWithOrderNCHW(); + + protected: + T temporal_scale_; + T height_scale_; + T width_scale_; + StorageOrder order_; +}; + +} // namespace caffe2 diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.cc b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.cc new file mode 100644 index 0000000000000..5e20d62365f6e --- /dev/null +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.cc @@ -0,0 +1,64 @@ +#include "caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.h" + +namespace caffe2 { + +template +bool ResizeNearest3DDNNLowPOp::RunOnDevice() { + using namespace dnnlowp; + + this->ParseDNNLowPOperatorArguments_(); + + // Choose quantization params + in_qparams_[0] = GetInputTensorQuantizationParamsOf(this, 0, qfactory_.get()); + + const auto& X = InputTensorCPU_(0); + auto* Y = OutputTensorCPU_(0); + + CAFFE_ENFORCE_EQ(X.ndim(), 5); + const int N = X.dim32(0); + // input frames + const int IF = X.dim32(1); + const int IH = X.dim32(2); + const int IW = X.dim32(3); + const int C = X.dim32(4); + const int OF = IF * temporal_scale_; + const int OH = IH * height_scale_; + const int OW = IW * width_scale_; + + vector buffer_shape{N, OF, OH, OW, C}; + Y->Resize(buffer_shape); + const T* X_data = X.template data(); + T* Y_data = Y->template mutable_data(); + +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (int n = 0; n < N; ++n) { + for (int t = 0; t < OF; ++t) { + const int in_f = std::min((int)(t / temporal_scale_), (IF - 1)); + for (int y = 0; y < OH; ++y) { + const int in_y = std::min((int)(y / height_scale_), (IH - 1)); + for (int x = 0; x < OW; ++x) { + const int in_x = std::min((int)(x / width_scale_), (IW - 1)); + std::memcpy( + &Y_data[((((n * OF) + t) * OH + y) * OW + x) * C], + &X_data[((((n * IF) + in_f) * IH + in_y) * IW + in_x) * C], + C * sizeof(T)); + } + } + } + } + // Even if there is a pre-chosen quantization parameters for the output, + // it is ignored because resize nearest output quantization should be same + // as the input. 
+ PropagateOutputTensorQuantizationParams(this, 0, in_qparams_[0]); + + return true; +} + +REGISTER_CPU_OPERATOR_WITH_ENGINE( + Int8ResizeNearest3D, + DNNLOWP, + ResizeNearest3DDNNLowPOp); + +} // namespace caffe2 diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.h b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.h new file mode 100644 index 0000000000000..e7e09d94abe59 --- /dev/null +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op.h @@ -0,0 +1,41 @@ +#pragma once + +#include "caffe2/operators/resize_3d_op.h" +#include "caffe2/quantization/server/dnnlowp_op.h" + +namespace caffe2 { + +using ResizeNearest3DFP32Op = ResizeNearest3DOp; + +template +class ResizeNearest3DDNNLowPOp final + : public DNNLowPOp { + public: + USE_OPERATOR_FUNCTIONS(CPUContext); + USE_DNNLOWP_OPERATOR_BASE_FUNCTIONS(T, ResizeNearest3DFP32Op); + + ResizeNearest3DDNNLowPOp(const OperatorDef& operator_def, Workspace* ws) + : BaseType(operator_def, ws), + temporal_scale_( + this->template GetSingleArgument("temporal_scale", 1)), + width_scale_(this->template GetSingleArgument("width_scale", 1)), + height_scale_( + this->template GetSingleArgument("height_scale", 1)) { + CAFFE_ENFORCE_GT(temporal_scale_, 0); + CAFFE_ENFORCE_GT(width_scale_, 0); + CAFFE_ENFORCE_GT(height_scale_, 0); + + const auto& order = StringToStorageOrder( + this->template GetSingleArgument("order", "NHWC")); + CAFFE_ENFORCE_EQ(order, StorageOrder::NHWC); + } + + bool RunOnDevice() override; + + private: + float temporal_scale_; + float width_scale_; + float height_scale_; +}; + +} // namespace caffe2 diff --git a/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py new file mode 100644 index 0000000000000..733e76cd58acd --- /dev/null +++ b/caffe2/quantization/server/resize_nearest_3d_dnnlowp_op_test.py @@ -0,0 +1,62 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +import caffe2.python.hypothesis_test_util as hu +import hypothesis.strategies as st +import numpy as np +from caffe2.python import core, dyndep, workspace +from hypothesis import given + + +dyndep.InitOpsLibrary("//caffe2/caffe2/quantization/server:dnnlowp_ops") +workspace.GlobalInit(["caffe2", "--caffe2_omp_num_threads=11"]) + + +class DNNLowPResizeNearest3DOpTest(hu.HypothesisTestCase): + @given( + N=st.integers(1, 3), + T=st.integers(1, 16), + H=st.integers(10, 300), + W=st.integers(10, 300), + C=st.integers(1, 32), + scale_t=st.floats(0.25, 4.0) | st.just(2.0), + scale_w=st.floats(0.25, 4.0) | st.just(2.0), + scale_h=st.floats(0.25, 4.0) | st.just(2.0), + **hu.gcs_cpu_only + ) + def test_resize_nearest(self, N, T, H, W, C, scale_t, scale_w, scale_h, gc, dc): + X = np.round(np.random.rand(N, T, H, W, C) * 255).astype(np.float32) + + quantize = core.CreateOperator("Quantize", ["X"], ["X_q"], engine="DNNLOWP") + resize_nearest = core.CreateOperator( + "Int8ResizeNearest3D", + ["X_q"], + ["Y_q"], + temporal_scale=scale_t, + width_scale=scale_w, + height_scale=scale_h, + engine="DNNLOWP", + ) + + net = core.Net("test_net") + net.Proto().op.extend([quantize, resize_nearest]) + + workspace.FeedBlob("X", X) + workspace.RunNetOnce(net) + X_q = workspace.FetchInt8Blob("X_q").data + Y_q = workspace.FetchInt8Blob("Y_q").data + + def resize_nearest_ref(X): + outT = np.int32(T * scale_t) + outH = np.int32(H * scale_h) + outW = np.int32(W * scale_w) + outT_idxs, outH_idxs, outW_idxs = np.meshgrid( + np.arange(outT), np.arange(outH), 
np.arange(outW), indexing="ij" + ) + inT_idxs = np.minimum(outT_idxs / scale_t, T - 1).astype(np.int32) + inH_idxs = np.minimum(outH_idxs / scale_h, H - 1).astype(np.int32) + inW_idxs = np.minimum(outW_idxs / scale_w, W - 1).astype(np.int32) + Y = X[:, inT_idxs, inH_idxs, inW_idxs, :] + return Y + + Y_q_ref = resize_nearest_ref(X_q) + np.testing.assert_allclose(Y_q, Y_q_ref) From cc964765a5b62900e44aa6358ff9594fb40a27cf Mon Sep 17 00:00:00 2001 From: Orion Reblitz-Richardson Date: Thu, 3 Oct 2019 17:06:04 -0700 Subject: [PATCH 032/113] Add method add_hparams to API doc (#27344) Summary: Adds the method `add_hparams` to `torch.utils.tensorboard` API docs. Will want to have this in PyTorch 1.3 release. cc sanekmelnikov lanpa natalialunova Pull Request resolved: https://github.com/pytorch/pytorch/pull/27344 Differential Revision: D17753689 Pulled By: orionr fbshipit-source-id: cc8636e0bdcf3f434444cd29471c62105491039d --- docs/source/tensorboard.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/tensorboard.rst b/docs/source/tensorboard.rst index e2a34ba03e81e..d3205e3ba5892 100644 --- a/docs/source/tensorboard.rst +++ b/docs/source/tensorboard.rst @@ -91,5 +91,6 @@ Expected result: .. automethod:: add_pr_curve .. automethod:: add_custom_scalars .. automethod:: add_mesh + .. automethod:: add_hparams .. automethod:: flush .. automethod:: close From 827a00cf63df1ddcf1c776713925c0c1ee0d7c2b Mon Sep 17 00:00:00 2001 From: Wanchao Liang Date: Thu, 3 Oct 2019 17:13:02 -0700 Subject: [PATCH 033/113] Support interface python assignment as an attribute (#26734) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26734 This PR added the python assignment for interface as an attribute in the module, it enables any object that implicitly inheriting the specific interface to be able to be assigned to the interface type in python. Serialization support for interface/class assignment will be done in the follow up PR Test Plan: Imported from OSS Differential Revision: D17742708 Pulled By: wanchaol fbshipit-source-id: a0a2d8c74b60ed3fa6c05e1b0d49b7ad1abc670b --- test/test_jit.py | 48 +++++++++++++++++++++--- torch/csrc/jit/pybind_utils.h | 32 +++++++++++++++- torch/csrc/jit/python_ir.cpp | 6 +++ torch/csrc/jit/script/compilation_unit.h | 8 ++++ torch/csrc/jit/script/sugared_value.h | 2 +- torch/jit/__init__.py | 1 + torch/jit/_recursive.py | 1 + torch/jit/annotations.py | 4 +- 8 files changed, 94 insertions(+), 8 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index c8afca1d9b24f..72b0787720201 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -19258,10 +19258,10 @@ def set_non_initialized(self, y): self.bar = y # can't assign to non-initialized attr def test_schema_human_readable(self): - """ + """ Make sure that the schema is human readable, ie the mode parameter should read "nearest" instead of being displayed in octal - aten::__interpolate(Tensor input, int? size=None, float[]? scale_factor=None, - str mode='\156\145\141\162\145\163\164', bool? align_corners=None) -> (Tensor): + aten::__interpolate(Tensor input, int? size=None, float[]? scale_factor=None, + str mode='\156\145\141\162\145\163\164', bool? align_corners=None) -> (Tensor): Expected a value of type 'Optional[int]' for argument 'size' but instead found type 'Tensor'. 
""" with self.assertRaisesRegex(RuntimeError, "nearest"): @@ -19716,8 +19716,46 @@ def wrong4(x): # type: (OneTwoWrong) -> int return as_interface(x) - # TODO test: interface-interface class-interface inheritance errors, - # NamedTuple inheritance errors + # Test interface/class python assignment + with torch.jit._disable_emit_hooks(): + class TestPyAssign(nn.Module): + def __init__(self): + super(TestPyAssign, self).__init__() + self.proxy_mod = Foo() + + def forward(self, x): + return self.proxy_mod.two(x) + + TestPyAssign.__annotations__ = {'proxy_mod': OneTwo} + + input = torch.rand(3, 4) + scripted_pyassign_mod = torch.jit.script(TestPyAssign()) + self.assertEqual(scripted_pyassign_mod(input), 2 * input) + + class TestPyAssignError(nn.Module): + def __init__(self, obj): + super(TestPyAssignError, self).__init__() + self.proxy_mod = obj + + def forward(self, x): + return self.proxy_mod.two(x) + + TestPyAssignError.__annotations__ = {'proxy_mod': OneTwoThree} + + with self.assertRaisesRegex(RuntimeError, + "is not compatible with interface __torch__"): + torch.jit.script(TestPyAssignError(Foo())) + + # test pure python object assignment to interface fails + class PyClass(object): + def __init__(self): + pass + + with self.assertRaisesRegex(RuntimeError, + "the value is not a TorchScript compatible type"): + torch.jit.script(TestPyAssignError(PyClass())) + # TODO test: interface-interface class-interface inheritance errors, + # NamedTuple inheritance errors def test_overloaded_fn(self): @torch.jit.script diff --git a/torch/csrc/jit/pybind_utils.h b/torch/csrc/jit/pybind_utils.h index 33e8b2604239a..362122e9c13fd 100644 --- a/torch/csrc/jit/pybind_utils.h +++ b/torch/csrc/jit/pybind_utils.h @@ -480,6 +480,37 @@ inline IValue toIValue( } return userObj; } + case TypeKind::InterfaceType: { + auto interfaceType = type->expect(); + // When converting an pyobj to an interface, we inspect the value + // to found the compiled TorchScript class, check if it conform + // with the interface or not, and then create a ivalue::Object + // from that class type. 
+ py::str qualified_name = py::module::import("torch.jit") + .attr("_qualified_name")(obj.get_type()); + auto pyCu = get_python_cu(); + const auto classType = + pyCu->get_class(c10::QualifiedName(qualified_name)); + if (!classType) { + throw std::runtime_error(c10::str( + "Assigning the object ", + py::str(obj), + " to an interface fails because the value is not " + "a TorchScript compatible type, did you forget to", + "turn it into a user defined TorchScript class?")); + } + std::stringstream why_not; + if (!classType->isSubtypeOfExt(interfaceType, &why_not)) { + throw py::cast_error(c10::str( + "Object ", + py::str(obj), + " is not compatible with interface ", + interfaceType->python_str(), + "\n", + why_not.str())); + } + return toIValue(std::move(obj), classType); + } case TypeKind::NumberType: { if (THPDtype_Check(obj.ptr())) { auto dtype = reinterpret_cast(obj.ptr()); @@ -502,7 +533,6 @@ inline IValue toIValue( case TypeKind::GeneratorType: case TypeKind::VarType: case TypeKind::FutureType: - case TypeKind::InterfaceType: break; case TypeKind::FunctionType: AT_ERROR("Function Values aren't yet supported"); diff --git a/torch/csrc/jit/python_ir.cpp b/torch/csrc/jit/python_ir.cpp index 201ec0f2bda28..921e24f93ecb0 100644 --- a/torch/csrc/jit/python_ir.cpp +++ b/torch/csrc/jit/python_ir.cpp @@ -722,6 +722,12 @@ void initPythonIRBindings(PyObject* module_) { .def(py::init([](const std::string& qualified_name) { return get_python_cu()->get_class(c10::QualifiedName(qualified_name)); })); + py::class_>( + m, "InterfaceType") + .def(py::init([](const std::string& qualified_name) { + return get_python_cu()->get_interface( + c10::QualifiedName(qualified_name)); + })); py::class_(m, "Use") .def_readonly("user", &Use::user) diff --git a/torch/csrc/jit/script/compilation_unit.h b/torch/csrc/jit/script/compilation_unit.h index 06c3727e25622..9e1ce88a3c72b 100644 --- a/torch/csrc/jit/script/compilation_unit.h +++ b/torch/csrc/jit/script/compilation_unit.h @@ -176,6 +176,14 @@ struct TORCH_API CompilationUnit { return type->cast(); } + c10::InterfaceTypePtr get_interface(const c10::QualifiedName& name) const { + auto type = get_type(name); + if (!type) { + return nullptr; + } + return type->cast(); + } + c10::TupleTypePtr get_named_tuple(const c10::QualifiedName& name) const { for (const auto& cls : classes_) { if (cls->name()->qualifiedName() == name.qualifiedName()) { diff --git a/torch/csrc/jit/script/sugared_value.h b/torch/csrc/jit/script/sugared_value.h index 2e1fbe92cf0dd..b838b0de1f91c 100644 --- a/torch/csrc/jit/script/sugared_value.h +++ b/torch/csrc/jit/script/sugared_value.h @@ -283,7 +283,7 @@ struct TORCH_API ClosureValue : public SugaredValue { Value* value_; }; -// defines how a method obtained from a module behaves in script +// defines how a method obtained from a module/class/interface behaves in script struct MethodValue : public SugaredValue { MethodValue(Value* self, std::string method_name) : self_(std::move(self)), method_name_(std::move(method_name)) {} diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 2a47205e1a3bf..62c3835c1251f 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -1257,6 +1257,7 @@ def interface(obj): ast = get_jit_class_def(obj, obj.__name__) rcb = _jit_internal.createResolutionCallback(1) torch._C._jit_script_interface_compile(qualified_name, ast, rcb) + obj.__torch_script_interface__ = True return obj ScriptMethodStub = namedtuple('ScriptMethodStub', ('resolution_callback', 'def_', 'original_method')) diff --git 
a/torch/jit/_recursive.py b/torch/jit/_recursive.py index e888137e192ec..c9706a2242046 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -70,6 +70,7 @@ def copy_to_script_module(original, stubs): the_type = torch.jit.annotations.ann_to_type(class_annotations[name]) else: the_type = torch._C._jit_try_infer_type(item) + if the_type is not None: try: script_module._c._register_attribute(name, the_type, item) diff --git a/torch/jit/annotations.py b/torch/jit/annotations.py index 0a5283ed4cf0f..8096f5150e6c1 100644 --- a/torch/jit/annotations.py +++ b/torch/jit/annotations.py @@ -7,7 +7,7 @@ BroadcastingList3, Tuple, is_tuple, is_list, Dict, is_dict, Optional, \ is_optional, _qualified_name from torch._C import TensorType, TupleType, FloatType, IntType, \ - ListType, StringType, DictType, BoolType, OptionalType, ClassType + ListType, StringType, DictType, BoolType, OptionalType, ClassType, InterfaceType from textwrap import dedent from torch._six import builtins from torch._utils_internal import get_source_lines_and_file @@ -249,6 +249,8 @@ def ann_to_type(ann, resolver=None): return BoolType.get() elif hasattr(ann, "__torch_script_class__"): return ClassType(_qualified_name(ann)) + elif hasattr(ann, "__torch_script_interface__"): + return InterfaceType(_qualified_name(ann)) elif resolver is not None: # Maybe resolve a NamedTuple to a Tuple Type rcb, loc = resolver From 8fe5dcf6996a091731993d9100ddde0f760cd562 Mon Sep 17 00:00:00 2001 From: davidriazati Date: Thu, 3 Oct 2019 17:13:52 -0700 Subject: [PATCH 034/113] Skip tests that use numpy if it's not present Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27165 Pulled By: driazati Differential Revision: D17695078 fbshipit-source-id: d25c920f4c43285028537f88761d47a2c9db7b8f --- test/common_utils.py | 20 ++++++++++++++++++++ test/test_jit.py | 5 ++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/test/common_utils.py b/test/common_utils.py index ef1342453670d..9a4e0c00561eb 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -208,6 +208,26 @@ def wrapper(*args, **kwargs): return wrapper +def skipIfCompiledWithoutNumpy(fn): + # Even if the numpy module is present, if `USE_NUMPY=0` is used during the + # build, numpy tests will fail + numpy_support = TEST_NUMPY + if numpy_support: + try: + # The numpy module is present, verify that PyTorch is compiled with + # numpy support + torch.from_numpy(numpy.array([2, 2])) + except RuntimeError: + numpy_support = False + + @wraps(fn) + def wrapper(*args, **kwargs): + if not numpy_support: + raise unittest.SkipTest("PyTorch was compiled without numpy support") + else: + fn(*args, **kwargs) + return wrapper + def _test_function(fn, device): def run_test_function(self): diff --git a/test/test_jit.py b/test/test_jit.py index 72b0787720201..4dc470ae391fa 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -26,7 +26,7 @@ # Testing utils from common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ skipIfRocm, skipIfNoLapack, suppress_warnings, load_tests, IS_SANDCASTLE, \ - freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName + freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy from jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ _trace, enable_cpu_fuser_if, enable_profiling_mode, do_input_map, \ execWrapper, _inline_everything, _tmp_donotuse_dont_inline_everything, \ @@ -1793,6 +1793,7 @@ def to_tensor(x, y): x, y = torch.randn(2, 2), torch.randn(1, 
10) self.assertEqual(to_tensor_trace(x, y), to_tensor(x, y)) + @skipIfCompiledWithoutNumpy def test_trace_warn(self): def fn(x): int(x) # Warning 1. @@ -8812,6 +8813,7 @@ def forward(self, rep): m = M2() m(torch.zeros(4, 3)) + @skipIfCompiledWithoutNumpy def test_pack_padded_pad_packed_trace(self): from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence T, B, C = 3, 5, 7 @@ -8932,6 +8934,7 @@ def foo(a): v = torch.rand(10, 3) self.assertEqual(torch.chunk(v, dim=0, chunks=2)[0], foo(v)) + @skipIfCompiledWithoutNumpy def test_rnn_trace_override(self): from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence num_layers = 3 From 2486b0ba829d1343014089cd9837e90fa3f62b1f Mon Sep 17 00:00:00 2001 From: Shen Li Date: Thu, 3 Oct 2019 17:45:30 -0700 Subject: [PATCH 035/113] Add Python RRef as args and return value (#25499) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/25499 See #23110 for model parallel design details, and #26759 for the RRef protocol. This commit add support for using RRef as Python UDF arguments and return value. RRefs can now be shared from owner to user, from user to owner, or from user to user. Limitations: 1. No implicit type conversion yet. (#27099) 2. No failure handling and retry. (#26116) 3. UDF is not yet blocked until all RRefs are confirmed. (#27098) 4. Internal RRef control messages are not idempotent yet. (#26116) 5. Cannot delete RRefs correctly when there are circular dependencies. (#27096) Main changes: 1. Added `SCRIPT_REMOTE_CALL` and `PYTHON_REMOTE_CALL` to `Message.h` to represent `dist.remote` invocations. 2. Added `SCRIPT_RREF_FETCH_CALL`, `PYTHON_RREF_FETCH_CALL`, `RREF_USER_ACCEPT`, `RREF_USER_DELETE`, `RREF_CHILD_ACCEPT`, and `RREF_FORK_REQUEST` to `Message.h` as internal RRef control messages. 3. New message request handling code is added to `functions.cpp`, and message format is added in `script_remote_call.h`, `python_remote_call.h`, and `rref_proto.h`. 4. Added a `PyRRef` type in `py_rref.h` and `py_rref.cpp` which holds a shared pointer to C++ `RRef` type. `PyRRef` wraps the C++ API and also implements RRef pickling and unpickling. RRef fork related control messages will be sent during RRef pickling/unpickling procedure. 5. Update `RRef.h` and `RRef.cpp` accordingly to support `py::object` RRefs. 6. RRef context (reference count, etc.) are tracked in `rref_context.h` and `rref_context.cpp`. 
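For illustration, a minimal sketch of the user-facing flow these changes
enable, mirroring the rpc_test.py changes below (worker names are
placeholders, and per limitations 1-5 above there is no failure handling):

    import torch
    import torch.distributed as dist

    # dist.remote returns a UserRRef immediately; the value is created on
    # the owner ("worker1"), and the fork/accept control messages above keep
    # the owner's reference count in sync.
    rref_a = dist.remote("worker1", torch.add, args=(torch.ones(2, 2), 1))
    rref_b = dist.remote("worker1", torch.add, args=(torch.ones(2, 2), 2))

    def my_rref_function(rref_a, rref_b):
        # Runs on the callee; to_here() fetches each value from its owner.
        return rref_a.to_here() + rref_b.to_here()

    # RRefs can be passed as RPC arguments and returned as results.
    ret = dist.rpc_sync("worker1", my_rref_function, args=(rref_a, rref_b))
    # ret == 5 * torch.ones(2, 2)
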
Test Plan: Imported from OSS buck test mode/dev-nosan //caffe2/test:rpc_fork Differential Revision: D17184146 Pulled By: mrshenli fbshipit-source-id: a3a268efc087ac1ef489136ab957080382629265 --- caffe2/CMakeLists.txt | 4 +- test/rpc_test.py | 384 ++++++++++++++++-- tools/build_variables.py | 6 +- torch/CMakeLists.txt | 2 +- torch/csrc/distributed/rpc/future_message.cpp | 1 - torch/csrc/distributed/rpc/init.cpp | 72 +++- torch/csrc/distributed/rpc/message.cpp | 34 +- torch/csrc/distributed/rpc/message.h | 43 +- .../distributed/rpc/process_group_agent.cpp | 68 ++-- .../distributed/rpc/process_group_agent.h | 21 +- torch/csrc/distributed/rpc/py_rref.cpp | 135 ++++++ torch/csrc/distributed/rpc/py_rref.h | 30 ++ .../csrc/distributed/rpc/python_functions.cpp | 72 +++- torch/csrc/distributed/rpc/python_functions.h | 25 +- .../distributed/rpc/python_remote_call.cpp | 53 +++ .../csrc/distributed/rpc/python_remote_call.h | 43 ++ .../distributed/rpc/python_rpc_handler.cpp | 24 +- .../csrc/distributed/rpc/python_rpc_handler.h | 8 + .../distributed/rpc/request_callback_impl.cpp | 72 +++- torch/csrc/distributed/rpc/rpc_agent.cpp | 10 +- torch/csrc/distributed/rpc/rpc_agent.h | 32 +- torch/csrc/distributed/rpc/rref.cpp | 176 +++++--- torch/csrc/distributed/rpc/rref.h | 251 ++++++++++-- torch/csrc/distributed/rpc/rref_context.cpp | 308 ++++++++++++-- torch/csrc/distributed/rpc/rref_context.h | 151 ++++--- torch/csrc/distributed/rpc/rref_proto.cpp | 173 ++++++++ torch/csrc/distributed/rpc/rref_proto.h | 137 +++++++ .../distributed/rpc/script_remote_call.cpp | 32 +- .../csrc/distributed/rpc/script_remote_call.h | 25 +- torch/csrc/distributed/rpc/types.cpp | 51 ++- torch/csrc/distributed/rpc/types.h | 20 +- torch/csrc/distributed/rpc/utils.cpp | 31 +- torch/distributed/__init__.py | 2 +- torch/distributed/internal_rpc_utils.py | 5 +- torch/distributed/rpc_api.py | 54 ++- 35 files changed, 2098 insertions(+), 457 deletions(-) create mode 100644 torch/csrc/distributed/rpc/py_rref.cpp create mode 100644 torch/csrc/distributed/rpc/py_rref.h create mode 100644 torch/csrc/distributed/rpc/python_remote_call.cpp create mode 100644 torch/csrc/distributed/rpc/python_remote_call.h create mode 100644 torch/csrc/distributed/rpc/rref_proto.cpp create mode 100644 torch/csrc/distributed/rpc/rref_proto.h diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 1f40502b7146e..b2b8e9358fb54 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -488,14 +488,16 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ${TORCH_SRC_DIR}/csrc/distributed/autograd/utils.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/future_message.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/message.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_remote_call.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_udf_call.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_udf_resp.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/request_callback.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rpc_with_autograd.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/rref_proto.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_call.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_remote_call.cpp - ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_rref_proto.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/script_resp.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/types.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/utils.cpp ${TORCH_SRC_DIR}/csrc/jit/export.cpp ${TORCH_SRC_DIR}/csrc/jit/import_legacy.cpp diff --git a/test/rpc_test.py b/test/rpc_test.py index 2b6f6414c4344..f7cefdcb62270 
100644 --- a/test/rpc_test.py +++ b/test/rpc_test.py @@ -10,11 +10,20 @@ import torch.distributed as dist import torch.distributed.rpc_backend_registry as rpc_backend_registry from collections import namedtuple -from torch.distributed.internal_rpc_utils import _internal_rpc_pickler, PythonUDF - from common_utils import load_tests -from torch.distributed.rpc_api import RpcBackend from dist_utils import dist_init +from torch.distributed import ProcessGroupAgent +from torch.distributed.internal_rpc_utils import _internal_rpc_pickler, PythonUDF +from torch.distributed.rpc_api import RpcBackend + + +def requires_process_group_agent(func): + from torch.distributed.rpc_api import _agent + return unittest.skipUnless( + isinstance(_agent, ProcessGroupAgent), + "Only ProcessGroupAgent supports global termination detection", + ) + BACKEND = getenv("RPC_BACKEND", RpcBackend.PROCESS_GROUP) RPC_INIT_URL = getenv("RPC_INIT_URL", "") @@ -97,6 +106,10 @@ def my_complex_tensor_function(list_input, tensor_class_input, dict_input): return (res, complex_tensors[0], complex_tensors[1], complex_tensors[2]) +def my_rref_function(rref_a, rref_b): + return rref_a.to_here() + rref_b.to_here() + + def no_result(): print("do nothing") @@ -110,18 +123,52 @@ def multi_layer_nested_async_rpc(dst, world_size, ttl): if ttl > 0: current_dst = "worker{}".format(dst) next_dst = (dst + 1) % world_size - dist.rpc( + dist.rpc_async( current_dst, multi_layer_nested_async_rpc, args=( next_dst, world_size, ttl - 1 - ), - async_call=True + ) ) return 0 +def nested_rref(dst): + return ( + dist.remote(dst, torch.add, args=(torch.ones(2, 2), 1)), + dist.remote(dst, torch.add, args=(torch.ones(2, 2), 2)) + ) + + +def nested_remote(dst): + rref = dist.remote(dst, torch.add, args=(torch.ones(2, 2), 3)) + return rref.to_here() + + +def rref_forward_chain(dst, world_size, rref, ttl): + if ttl > 0: + current_dst = "worker{}".format(dst) + next_dst = (dst + 1) % world_size + ret_rref = dist.remote( + current_dst, + rref_forward_chain, + args=( + next_dst, + world_size, + rref, + ttl - 1 + ) + ) + return [ret_rref] + else: + return rref.to_here() + + +def rpc_return_rref(dst): + return dist.remote(dst, torch.add, args=(torch.ones(2, 2), 1)) + + def light_rpc(): return 0 @@ -155,32 +202,40 @@ def world_size(self): def test_worker_id(self): n = self.rank + 1 peer_rank = n % self.world_size - self_worker_id = dist.get_worker_id() - peer_worker_id = dist.get_worker_id("worker{}".format(peer_rank)) + self_worker_info = dist.get_worker_info() + peer_worker_info = dist.get_worker_info("worker{}".format(peer_rank)) - self.assertEqual(self_worker_id.name, "worker{}".format(self.rank)) - self.assertEqual(peer_worker_id.name, "worker{}".format(peer_rank)) + self.assertEqual(self_worker_info.name, "worker{}".format(self.rank)) + self.assertEqual(peer_worker_info.name, "worker{}".format(peer_rank)) with self.assertRaisesRegex(RuntimeError, "Unknown destination worker"): - unknown_worker_id = dist.get_worker_id("WorkerUnknown") + unknown_worker_id = dist.get_worker_info("WorkerUnknown") @dist_init def test_self_add(self): - self_worker_id = dist.get_worker_id() + self_worker_info = dist.get_worker_info() self_worker_name = "worker{}".format(self.rank) with self.assertRaisesRegex( RuntimeError, "does not support making RPC calls to self" ): - dist.rpc_sync(self_worker_id, torch.add, args=(torch.ones(2, 2), 1)) + dist.rpc_sync( + self_worker_info, + torch.add, + args=(torch.ones(2, 2), 1) + ) with self.assertRaisesRegex( RuntimeError, "does not support making 
RPC calls to self" ): - dist.rpc_sync(self_worker_name, torch.add, args=(torch.ones(2, 2), 1)) + dist.rpc_sync( + self_worker_name, + torch.add, + args=(torch.ones(2, 2), 1) + ) @mock.patch.object(torch.distributed.autograd, "_init") - @mock.patch.object(torch.distributed.rpc_api, "init_rref_context") + @mock.patch.object(torch.distributed.rpc_api, "_init_rref_context") def test_register_rpc_backend_and_init_rpc_backend( self, mock_init_rref_context, mock_dist_autograd_init ): @@ -254,7 +309,7 @@ def test_invalid_names(self): dist.init_model_parallel(self_name="") # If the number in the message does not match, it is likely that the - # value of MAX_NAME_LEN in RPC WorkerId has changed. + # value of MAX_NAME_LEN in RPC WorkerInfo has changed. with self.assertRaisesRegex(RuntimeError, "shorter than 128"): dist.init_model_parallel( self_name="".join(["a" for _ in range(500)]), @@ -279,10 +334,10 @@ def test_add(self): def test_add_with_id(self): n = self.rank + 1 dst_rank = n % self.world_size - workder_id = dist.get_worker_id("worker{}".format(dst_rank)) + workder_info = dist.get_worker_info("worker{}".format(dst_rank)) ret = dist.rpc_sync( - workder_id, torch.add, args=(torch.ones(n, n), torch.ones(n, n)) + workder_info, torch.add, args=(torch.ones(n, n), torch.ones(n, n)) ) self.assertEqual(ret, torch.ones(n, n) * 2) @@ -424,9 +479,17 @@ def test_py_class_static_method(self): def test_py_multi_async_call(self): n = self.rank + 1 dst_rank = n % self.world_size - dst_worker_id = dist.get_worker_id("worker{}".format(dst_rank)) - fut1 = dist.rpc_async(dst_worker_id, MyClass.my_static_method, args=(n + 10,)) - fut2 = dist.rpc_async(dst_worker_id, min, args=(n, n + 1, n + 2)) + dst_worker_info = dist.get_worker_info("worker{}".format(dst_rank)) + fut1 = dist.rpc_async( + dst_worker_info, + MyClass.my_static_method, + args=(n + 10,) + ) + fut2 = dist.rpc_async( + dst_worker_info, + min, + args=(n, n + 1, n + 2) + ) self.assertEqual(fut1.wait(), MyClass.my_static_method(n + 10)) self.assertEqual(fut2.wait(), min(n, n + 1, n + 2)) @@ -441,9 +504,11 @@ def test_py_no_return_result(self): def test_py_tensors(self): n = self.rank + 1 dst_rank = n % self.world_size - ret = dist.rpc("worker{}".format(dst_rank), - my_tensor_function, - args=(torch.ones(n, n), torch.ones(n, n))) + ret = dist.rpc_sync( + "worker{}".format(dst_rank), + my_tensor_function, + args=(torch.ones(n, n), torch.ones(n, n)) + ) self.assertEqual(ret, my_tensor_function(torch.ones(n, n), torch.ones(n, n))) @@ -454,10 +519,11 @@ def test_py_tensors_multi_async_call(self): n = self.rank + 1 dst_rank = n % self.world_size for i in range(100): - fut = dist.rpc("worker{}".format(dst_rank), - my_tensor_function, - args=(torch.ones(i, i), torch.ones(i, i)), - async_call=True) + fut = dist.rpc_async( + "worker{}".format(dst_rank), + my_tensor_function, + args=(torch.ones(i, i), torch.ones(i, i)) + ) futs.append(fut) j = 0 @@ -474,9 +540,11 @@ def test_py_tensors_in_container(self): a = [torch.ones(n, n), torch.ones(n, n)] b = TensorClass(build_complex_tensors()) c = {"foo": torch.ones(n, n), "bar": torch.ones(n, n)} - ret = dist.rpc("worker{}".format(dst_rank), - my_complex_tensor_function, - args=(a, b, c)) + ret = dist.rpc_sync( + "worker{}".format(dst_rank), + my_complex_tensor_function, + args=(a, b, c) + ) self.assertEqual(ret, my_complex_tensor_function(a, b, c)) @dist_init @@ -484,9 +552,11 @@ def test_py_nested_pickle(self): n = self.rank + 1 dst_rank = n % self.world_size - ret = dist.rpc("worker{}".format(dst_rank), - 
run_nested_pickle, - args=(MyPickleClass(), torch.ones(2, 2))) + ret = dist.rpc_sync( + "worker{}".format(dst_rank), + run_nested_pickle, + args=(MyPickleClass(), torch.ones(2, 2)) + ) m = MyPickleClass() m.set(my_tensor_function(torch.ones(2, 2), torch.ones(2, 2))) @@ -497,7 +567,11 @@ def test_py_function_exception(self): n = self.rank + 1 dst_rank = n % self.world_size with self.assertRaisesRegex(Exception, "TypeError"): - ret = dist.rpc_sync("worker{}".format(dst_rank), no_result, args=(10,)) + ret = dist.rpc_sync( + "worker{}".format(dst_rank), + no_result, + args=(10,) + ) @dist_init def test_py_raise_in_user_func(self): @@ -557,8 +631,9 @@ def test_builtin_remote_ret(self): ) self.assertEqual(rref.to_here(), torch.ones(n, n) * 2) - @dist_init - def test_multi_builtin_remote_ret(self): + def _test_multi_remote_call( + self, fn, args_fn=lambda x: (), kwargs_fn=lambda x: {} + ): m = 10 n = self.rank + 1 dst_rank = n % self.world_size @@ -566,19 +641,171 @@ def test_multi_builtin_remote_ret(self): expected = [] for i in range(m): n = n + i - rrefs.append( + rrefs.append(dist.remote( + 'worker{}'.format(dst_rank), + fn, + args=args_fn(n), + kwargs=kwargs_fn(n) + )) + expected.append(fn(*args_fn(n), **kwargs_fn(n))) + + for i in range(m): + self.assertEqual(rrefs[i].to_here(), expected[i]) + + @dist_init + @requires_process_group_agent + def test_multi_builtin_remote_ret(self): + def args_fn(n): + return (torch.ones(n, n), torch.ones(n, n)) + self._test_multi_remote_call(torch.add, args_fn=args_fn) + + @dist_init + @requires_process_group_agent + def test_py_udf_remote(self): + n = self.rank + 1 + dst_rank = n % self.world_size + rref = dist.remote( + "worker{}".format(dst_rank), + my_function, + kwargs={"a": n, "b": n + 1, "c": n + 2}, + ) + self.assertEqual(rref.to_here(), my_function(n, n + 1, n + 2)) + + @dist_init + @requires_process_group_agent + def test_multi_py_udf_remote(self): + def kwargs_fn(n): + return { + "a" : torch.ones(n, n), + "b" : torch.ones(n, n), + "c" : torch.ones(n, n), + } + self._test_multi_remote_call(my_function, kwargs_fn=kwargs_fn) + + @dist_init + @requires_process_group_agent + def test_py_rref_args(self): + n = self.rank + 1 + dst_rank = n % self.world_size + rref_a = dist.remote( + 'worker{}'.format(dst_rank), + torch.add, + args=(torch.ones(n, n), 2) + ) + rref_b = dist.remote( + 'worker{}'.format(dst_rank), + torch.add, + args=(torch.ones(n, n), 1) + ) + rref_c = dist.remote( + 'worker{}'.format(dst_rank), + my_rref_function, + args=(rref_a, rref_b) + ) + self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) + + @dist_init + @requires_process_group_agent + def test_py_rref_args_user_share(self): + n = self.rank + 1 + owner_rank = n % self.world_size + user_rank = (n + 1) % self.world_size + rref_a = dist.remote( + 'worker{}'.format(owner_rank), + my_function, + args=(torch.ones(n, n), 2, 0) + ) + rref_b = dist.remote( + 'worker{}'.format(owner_rank), + my_function, + args=(torch.ones(n, n), 1, 0) + ) + rref_c = dist.remote( + 'worker{}'.format(user_rank), + my_rref_function, + args=(rref_a, rref_b) + ) + self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) + + + @dist_init + @requires_process_group_agent + def test_py_rpc_rref_args(self): + n = self.rank + 1 + dst_rank = n % self.world_size + rref_a = dist.remote( + "worker{}".format(dst_rank), + my_function, + args=(torch.ones(n, n), 2, 0) + ) + rref_b = dist.remote( + "worker{}".format(dst_rank), + my_function, + args=(torch.ones(n, n), 1, 0) + ) + + c = dist.rpc_sync( + 
"worker{}".format(dst_rank), + my_rref_function, + args=(rref_a, rref_b) + ) + + self.assertEqual(c, torch.ones(n, n) + 4) + + @dist_init + @requires_process_group_agent + def test_nested_remote(self): + n = self.rank + 1 + dst_rank1 = n % self.world_size + dst_rank2 = (n + 1) % self.world_size + rref = dist.remote( + "worker{}".format(dst_rank1), + nested_remote, + args=("worker{}".format(dst_rank2),) + ) + self.assertEqual(rref.to_here(), torch.ones(2, 2) + 3) + + @dist_init + @requires_process_group_agent + def test_nested_rref(self): + n = self.rank + 1 + dst_rank1 = n % self.world_size + dst_rank2 = (n + 1) % self.world_size + rref_of_rrefs = dist.remote( + "worker{}".format(dst_rank1), + nested_rref, + args=("worker{}".format(dst_rank2),) + ) + rrefs = rref_of_rrefs.to_here() + self.assertEqual(len(rrefs), 2) + self.assertEqual(rrefs[0].to_here(), torch.ones(2, 2) + 1) + self.assertEqual(rrefs[1].to_here(), torch.ones(2, 2) + 2) + + @dist_init + @requires_process_group_agent + def test_nested_rref_stress(self): + n = self.rank + 1 + dst_rank1 = n % self.world_size + dst_rank2 = (n + 1) % self.world_size + all_rrefs = [] + for _ in range(20): + all_rrefs.append( dist.remote( - "worker{}".format(dst_rank), - torch.add, - args=(torch.ones(n, n), torch.ones(n, n)), + "worker{}".format(dst_rank1), + nested_rref, + args=("worker{}".format(dst_rank2),) ) ) - expected.append(torch.ones(n, n) * 2) - for i in range(m): - self.assertEqual(rrefs[i].to_here(), expected[i]) + for i in range(20): + rref_of_rrefs = all_rrefs[i] + rrefs = rref_of_rrefs.to_here() + self.assertEqual(len(rrefs), 2) + self.assertEqual(rrefs[0].to_here(), torch.ones(2, 2) + 1) + self.assertEqual(rrefs[1].to_here(), torch.ones(2, 2) + 2) @dist_init + @requires_process_group_agent def test_multi_layer_nested_async_rpc(self): # This test will exit right away, but there will be a chain of async # RPCs. The termination algorithm should detect those messages properly. 
@@ -589,3 +816,72 @@ def test_multi_layer_nested_async_rpc(self): dst_rank = n % self.world_size multi_layer_nested_async_rpc(dst_rank, self.world_size, ttl) + + @dist_init + @requires_process_group_agent + def test_remote_with_exception(self): + n = self.rank + 1 + dst_rank = n % self.world_size + rref = dist.remote( + "worker{}".format(dst_rank), + raise_func + ) + with self.assertRaisesRegex(Exception, "ValueError"): + rref.to_here() + + @dist_init + @requires_process_group_agent + def test_rpc_return_rref(self): + n = self.rank + 1 + dst_rank1 = n % self.world_size + dst_rank2 = (n + 1) % self.world_size + rref = dist.rpc_sync( + "worker{}".format(dst_rank1), + rpc_return_rref, + args=("worker{}".format(dst_rank2),) + ) + self.assertEqual(rref.to_here(), torch.ones(2, 2) + 1) + + @dist_init + @requires_process_group_agent + def test_rref_forward_chain(self): + ttl = 8 + n = self.rank + 1 + dst_rank = n % self.world_size + + rref = dist.remote( + "worker{}".format(dst_rank), + torch.add, + args=(torch.ones(n, n), 1) + ) + + ret_rref = rref_forward_chain(dst_rank, self.world_size, rref, ttl) + + for i in range(ttl): + self.assertEqual(len(ret_rref), 1) + ret_rref = ret_rref[0].to_here() + + ret = ret_rref + self.assertEqual(ret, torch.add(torch.ones(n, n), 1)) + + @dist_init + @requires_process_group_agent + def test_remote_same_worker(self): + n = self.rank + 1 + dst_rank = n % self.world_size + rref_a = dist.remote( + "worker{}".format(dst_rank), + torch.add, + args=(torch.ones(n, n), 2) + ) + rref_b = dist.remote( + "worker{}".format(dst_rank), + torch.add, + args=(torch.ones(n, n), 1) + ) + rref_c = dist.remote( + "worker{}".format(dst_rank), + my_rref_function, + args=(rref_a, rref_b) + ) + self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) diff --git a/tools/build_variables.py b/tools/build_variables.py index b68094e554f1c..c5d47b34ef820 100644 --- a/tools/build_variables.py +++ b/tools/build_variables.py @@ -56,14 +56,16 @@ "torch/csrc/distributed/autograd/functions/sendrpc_backward.cpp", "torch/csrc/distributed/rpc/future_message.cpp", "torch/csrc/distributed/rpc/message.cpp", + "torch/csrc/distributed/rpc/python_remote_call.cpp", "torch/csrc/distributed/rpc/python_udf_call.cpp", "torch/csrc/distributed/rpc/python_udf_resp.cpp", "torch/csrc/distributed/rpc/request_callback.cpp", "torch/csrc/distributed/rpc/rpc_with_autograd.cpp", + "torch/csrc/distributed/rpc/rref_proto.cpp", "torch/csrc/distributed/rpc/script_call.cpp", "torch/csrc/distributed/rpc/script_remote_call.cpp", - "torch/csrc/distributed/rpc/script_rref_proto.cpp", "torch/csrc/distributed/rpc/script_resp.cpp", + "torch/csrc/distributed/rpc/types.cpp", "torch/csrc/distributed/rpc/utils.cpp", "torch/csrc/Exceptions.cpp", "torch/csrc/jit/autodiff.cpp", @@ -269,13 +271,13 @@ def add_torch_libs(): "torch/csrc/distributed/c10d/reducer.cpp", "torch/csrc/distributed/rpc/init.cpp", "torch/csrc/distributed/rpc/process_group_agent.cpp", + "torch/csrc/distributed/rpc/py_rref.cpp", "torch/csrc/distributed/rpc/python_functions.cpp", "torch/csrc/distributed/rpc/python_rpc_handler.cpp", "torch/csrc/distributed/rpc/request_callback_impl.cpp", "torch/csrc/distributed/rpc/rpc_agent.cpp", "torch/csrc/distributed/rpc/rref.cpp", "torch/csrc/distributed/rpc/rref_context.cpp", - "torch/csrc/distributed/rpc/types.cpp", "torch/csrc/jit/init.cpp", "torch/csrc/jit/passes/inline_fork_wait.cpp", "torch/csrc/jit/passes/onnx.cpp", diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 19eeecd0bb739..43481ee687064 100644 --- 
a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -229,13 +229,13 @@ if (USE_DISTRIBUTED) ${TORCH_SRC_DIR}/csrc/distributed/c10d/reducer.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/init.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/process_group_agent.cpp + ${TORCH_SRC_DIR}/csrc/distributed/rpc/py_rref.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_functions.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/python_rpc_handler.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/request_callback_impl.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rpc_agent.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rref_context.cpp ${TORCH_SRC_DIR}/csrc/distributed/rpc/rref.cpp - ${TORCH_SRC_DIR}/csrc/distributed/rpc/types.cpp ) list(APPEND TORCH_PYTHON_LINK_LIBRARIES c10d) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_C10D) diff --git a/torch/csrc/distributed/rpc/future_message.cpp b/torch/csrc/distributed/rpc/future_message.cpp index c8030708ac7c9..92aef393a5b99 100644 --- a/torch/csrc/distributed/rpc/future_message.cpp +++ b/torch/csrc/distributed/rpc/future_message.cpp @@ -7,7 +7,6 @@ namespace rpc { const Message& FutureMessage::wait() { std::unique_lock lock(mutex_); finished_cv_.wait(lock, [this] { return completed_.load(); }); - return message_; } diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index e70e89a3ff653..8448ad3436470 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -29,9 +30,9 @@ PyObject* rpc_init(PyObject* /* unused */) { auto module = py::handle(dist_module).cast(); - auto workerId = shared_ptr_class_(module, "WorkerId") - .def_readonly("name", &WorkerId::name_) - .def_readonly("id", &WorkerId::id_); + auto workerInfo = shared_ptr_class_(module, "WorkerInfo") + .def_readonly("name", &WorkerInfo::name_) + .def_readonly("id", &WorkerInfo::id_); auto rpcAgent = shared_ptr_class_(module, "RpcAgent") @@ -42,13 +43,33 @@ PyObject* rpc_init(PyObject* /* unused */) { &RpcAgent::sync, py::call_guard()); - auto rref = - shared_ptr_class_(module, "RRef") - .def("owner", &RRef::owner, py::call_guard()) + auto pyRRef = + shared_ptr_class_(module, "RRef") + .def( + // not releasing GIL here to avoid context switch on getters + "is_owner", + &PyRRef::isOwner) + .def( + // not releasing GIL here to avoid context switch on getters + "owner", + &PyRRef::owner) .def( "to_here", - [&](RRef& rref) { return torch::jit::toPyObject(rref.toHere()); }, - py::call_guard()); + &PyRRef::toHere, + py::call_guard()) + .def( + "local_value", + &PyRRef::localValue, + py::call_guard()) + .def(py::pickle( + [](const PyRRef& self) { + // __getstate__ + return self.pickle(); + }, + [](py::tuple t) { // NOLINT + // __setstate__ + return PyRRef::unpickle(t); + })); auto futureMessage = shared_ptr_class_(module, "FutureMessage") @@ -64,14 +85,14 @@ PyObject* rpc_init(PyObject* /* unused */) { py::arg("process_group"), py::arg("num_send_recv_threads") = 4) .def( - "get_worker_id", - (const WorkerId& (ProcessGroupAgent::*)(void)const) & - RpcAgent::getWorkerId, + "get_worker_info", + (const WorkerInfo& (ProcessGroupAgent::*)(void)const) & + RpcAgent::getWorkerInfo, py::call_guard()) .def( - "get_worker_id", - (const WorkerId& (ProcessGroupAgent::*)(const std::string&)const) & - ProcessGroupAgent::getWorkerId, + "get_worker_info", + (const WorkerInfo& (ProcessGroupAgent::*)(const std::string&)const) & + ProcessGroupAgent::getWorkerInfo, py::call_guard()) .def( "join", @@ -82,14 +103,18 @@ 
PyObject* rpc_init(PyObject* /* unused */) { &ProcessGroupAgent::sync, py::call_guard()); - module.def("init_rref_context", [](std::shared_ptr agent) { + module.def("_init_rref_context", [](std::shared_ptr agent) { RRefContext::initInstance(std::move(agent)); }); + module.def("_destroy_rref_context", []() { + RRefContext::getInstance()->destroyInstance(); + }); + module.def( "invoke_rpc_builtin", [](RpcAgent& agent, - const WorkerId& dst, + const WorkerInfo& dst, const std::string& opName, const py::args& args, const py::kwargs& kwargs) { @@ -99,8 +124,8 @@ PyObject* rpc_init(PyObject* /* unused */) { module.def( "invoke_rpc_python_udf", [](RpcAgent& agent, - const WorkerId& dst, - const std::string& pickledPythonUDF, + const WorkerInfo& dst, + std::string& pickledPythonUDF, std::vector& tensors) { return pyRpcPythonUdf(agent, dst, pickledPythonUDF, tensors); }); @@ -108,13 +133,22 @@ PyObject* rpc_init(PyObject* /* unused */) { module.def( "invoke_remote_builtin", [](RpcAgent& agent, - const WorkerId& dst, + const WorkerInfo& dst, const std::string& opName, const py::args& args, const py::kwargs& kwargs) { return pyRemoteBuiltin(agent, dst, opName, args, kwargs); }); + module.def( + "invoke_remote_python_udf", + [](RpcAgent& agent, + const WorkerInfo& dst, + std::string& pickledPythonUDF, + std::vector& tensors) { + return pyRemotePythonUdf(agent, dst, pickledPythonUDF, tensors); + }); + Py_RETURN_TRUE; } diff --git a/torch/csrc/distributed/rpc/message.cpp b/torch/csrc/distributed/rpc/message.cpp index f518421565053..18a68b1eb9043 100644 --- a/torch/csrc/distributed/rpc/message.cpp +++ b/torch/csrc/distributed/rpc/message.cpp @@ -65,24 +65,28 @@ const MessageType& Message::type() const { } bool Message::isRequest() const { - return MessageType::SCRIPT_CALL == type_ || - MessageType::PYTHON_CALL == type_ || MessageType::REMOTE_CALL == type_ || - MessageType::MESSAGE_WITH_AUTOGRAD_REQ == type_ || - MessageType::RREF_FETCH_CALL == type_ || - MessageType::RREF_USER_CREATE == type_ || - MessageType::RREF_USER_DELETE == type_; -} - -bool Message::requiresResponse() const { - return MessageType::SCRIPT_CALL == type_ || - MessageType::PYTHON_CALL == type_ || - MessageType::MESSAGE_WITH_AUTOGRAD_REQ == type_ || - MessageType::RREF_FETCH_CALL == type_; + return MessageType::SCRIPT_CALL == type_ || // dist.rpc on builtin ops + MessageType::PYTHON_CALL == type_ || // dist.rpc on Python UDFs + MessageType::SCRIPT_REMOTE_CALL == type_ || // dist.remote on builtin ops + MessageType::PYTHON_REMOTE_CALL == type_ || // dist.remote on Python UDFs + // RRef related internal messages + MessageType::SCRIPT_RREF_FETCH_CALL == type_ || + MessageType::PYTHON_RREF_FETCH_CALL == type_ || + MessageType::RREF_USER_DELETE == type_ || + MessageType::RREF_CHILD_ACCEPT == type_ || + MessageType::RREF_FORK_REQUEST == type_ || + // Autograd message + MessageType::MESSAGE_WITH_AUTOGRAD_REQ == type_; } bool Message::isResponse() const { - return MessageType::SCRIPT_RET == type_ || MessageType::PYTHON_RET == type_ || - MessageType::RREF_FETCH_RET == type_ || + return MessageType::SCRIPT_RET == type_ || // ret of dist.rpc on builtin ops + MessageType::PYTHON_RET == type_ || // ret of dist.rpc on Python UDFs + MessageType::REMOTE_RET == type_ || // ret of dist.remote + MessageType::RREF_FETCH_RET == type_ || // ret on RRef::toHere() + MessageType::EXCEPTION == type_ || // propagate back exceptions + MessageType::RREF_ACK == type_ || // ret of other types + // Autograd response MessageType::MESSAGE_WITH_AUTOGRAD_RESP == type_; } 
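For reference, the request/response classification above pairs up as follows
(taken from the MessageType comments in message.h below; internal RRef
control messages other than the fetch calls are answered with RREF_ACK):

    dist.rpc on a builtin op   SCRIPT_CALL            -> SCRIPT_RET
    dist.rpc on a Python UDF   PYTHON_CALL            -> PYTHON_RET
    dist.remote (builtin op)   SCRIPT_REMOTE_CALL     -> REMOTE_RET
    dist.remote (Python UDF)   PYTHON_REMOTE_CALL     -> REMOTE_RET
    UserRRef.to_here()         SCRIPT_RREF_FETCH_CALL or
                               PYTHON_RREF_FETCH_CALL -> RREF_FETCH_RET
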
diff --git a/torch/csrc/distributed/rpc/message.h b/torch/csrc/distributed/rpc/message.h index 76fbd9f805c6a..f9d1e7c866b74 100644 --- a/torch/csrc/distributed/rpc/message.h +++ b/torch/csrc/distributed/rpc/message.h @@ -8,20 +8,36 @@ namespace distributed { namespace rpc { enum MessageType { + // messages for dist.rpc on builtin operators SCRIPT_CALL = 0, - SCRIPT_RET, - PYTHON_CALL, - PYTHON_RET, - REMOTE_CALL, - RREF_FETCH_CALL, - RREF_FETCH_RET, - RREF_USER_CREATE, - RREF_USER_DELETE, - MESSAGE_WITH_AUTOGRAD_REQ, - MESSAGE_WITH_AUTOGRAD_RESP, - SHUTDOWN, - EXCEPTION, - UNKNOWN + SCRIPT_RET = 1, + + // messages for dist.rpc on Python UDF + PYTHON_CALL = 2, + PYTHON_RET = 3, + + // messages for dist.remote on builtin operators and Python UDF + SCRIPT_REMOTE_CALL = 4, // A remote call on a builtin operator + PYTHON_REMOTE_CALL = 5, // A remote call on a Python UDF + REMOTE_RET = 6, // A remote call on a Python UDF + + // RRef related internal messages + SCRIPT_RREF_FETCH_CALL = 7, // A UserRRef fetches value from owner + PYTHON_RREF_FETCH_CALL = 8, // A UserRRef fetches value from owner + RREF_FETCH_RET = 9, // An OwnerRRef sends value to user + RREF_USER_DELETE = 10, // A UserRRef tells the owner to deref + RREF_FORK_REQUEST = 11, // A child UserRRef tells the owner about itself + RREF_CHILD_ACCEPT = 12, // A child UserRRef tells parent that owner knows it + RREF_ACK = 13, // ACK to internal RRef messages + + // Messages with autograd info + MESSAGE_WITH_AUTOGRAD_REQ = 14, + MESSAGE_WITH_AUTOGRAD_RESP = 15, + + // Other internal message types + SHUTDOWN = 16, + EXCEPTION = 17, + UNKNOWN = 18 }; // A message to be sent/received by an RpcAgent. @@ -73,7 +89,6 @@ class TORCH_API Message final { const MessageType& type() const; bool isRequest() const; - bool requiresResponse() const; bool isResponse() const; bool isShutdown() const; diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index 0609fa2b08a31..37b1651536c69 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -74,18 +74,18 @@ std::vector ProcessGroupAgent::MessageCounter::snapshot() { //////////////////////// ProcessGroupAgent ///////////////////////////////// void ProcessGroupAgent::collectNames() { - const std::string& workerName = workerId_.name_; + const std::string& workerName = workerInfo_.name_; const auto worldSize = pg_->getSize(); // use c10d allgather to collect names torch::Tensor nameTensor = - torch::zeros({WorkerId::MAX_NAME_LEN}, torch::kChar); + torch::zeros({WorkerInfo::MAX_NAME_LEN}, torch::kChar); memcpy(nameTensor.storage().data(), workerName.c_str(), workerName.length()); std::vector inputName = {nameTensor}; std::vector> outputNames(1); for (int i = 0; i < worldSize; ++i) { outputNames[0].emplace_back( - torch::empty({WorkerId::MAX_NAME_LEN}, {torch::kChar})); + torch::empty({WorkerInfo::MAX_NAME_LEN}, {torch::kChar})); } pg_->allgather(outputNames, inputName)->wait(); @@ -109,7 +109,7 @@ ProcessGroupAgent::ProcessGroupAgent( std::shared_ptr pg, int numSendRecvThreads) : RpcAgent( - WorkerId(std::move(workerName), pg->getRank()), + WorkerInfo(std::move(workerName), pg->getRank()), c10::guts::make_unique()), pg_(std::move(pg)), sendCounts_(pg_->getSize()), @@ -123,12 +123,12 @@ ProcessGroupAgent::ProcessGroupAgent( "ProcessGroupAgent requires world_size to " "be at least 2, but got ", nameMap_.size()); - auto workerRankIter = nameMap_.find(workerId_.name_); + auto 
workerRankIter = nameMap_.find(workerInfo_.name_); TORCH_CHECK( workerRankIter != nameMap_.end(), "Failed to resolve worker " "name ", - workerId_.name_, + workerInfo_.name_, " to a ProcessGroup rank."); TORCH_CHECK( pg_->getRank() == workerRankIter->second, @@ -143,9 +143,9 @@ ProcessGroupAgent::ProcessGroupAgent( tmpWorkerIds[entry.second] = entry.first; } - workerIds_.reserve(pg_->getSize()); + allWorkerInfo_.reserve(pg_->getSize()); for (int rank = 0; rank < (int)tmpWorkerIds.size(); ++rank) { - workerIds_.emplace_back(std::move(tmpWorkerIds[rank]), rank); + allWorkerInfo_.emplace_back(std::move(tmpWorkerIds[rank]), rank); } // construct PythonRpcHandler singleton here @@ -153,17 +153,17 @@ ProcessGroupAgent::ProcessGroupAgent( listenerThread_ = std::thread(&ProcessGroupAgent::listenLoop, this); } -const WorkerId& ProcessGroupAgent::getWorkerId( +const WorkerInfo& ProcessGroupAgent::getWorkerInfo( const std::string& workerName) const { const auto idIter = nameMap_.find(workerName); TORCH_CHECK( idIter != nameMap_.end(), "Unknown destination worker ", workerName); - return workerIds_[idIter->second]; + return allWorkerInfo_[idIter->second]; } -const WorkerId& ProcessGroupAgent::getWorkerId(worker_id_t id) const { - return workerIds_[id]; +const WorkerInfo& ProcessGroupAgent::getWorkerInfo(worker_id_t id) const { + return allWorkerInfo_[id]; } void ProcessGroupAgent::join() { @@ -174,9 +174,13 @@ void ProcessGroupAgent::join() { // 2. A GLOO process cannot send message to itself. (there is an ongoing // effort to fix this problem). sync(); + std::unique_lock lock(futureMutex_); + futureCV_.wait(lock, [this] { return futures_.empty(); }); + lock.unlock(); + pg_->barrier()->wait(); int dst = (pg_->getRank() + 1) % pg_->getSize(); enqueueSend( - SendWork(workerIds_[dst], Message({}, {}, MessageType::SHUTDOWN))); + SendWork(allWorkerInfo_[dst], Message({}, {}, MessageType::SHUTDOWN))); threadPool_.waitWorkComplete(); listenerThread_.join(); } @@ -198,7 +202,6 @@ bool ProcessGroupAgent::hasPendingMessage() { std::vector inputSnapshot = { torch::from_blob(snapshot.data(), {2, worldSize}, {torch::kInt64})}; - // allgather both send and recv messages in one shot std::vector> outputSnapshots(1); @@ -245,7 +248,7 @@ void ProcessGroupAgent::sync() { } std::shared_ptr ProcessGroupAgent::send( - const WorkerId& to, + const WorkerInfo& to, Message&& message) { TORCH_CHECK( to.id_ != (worker_id_t)pg_->getRank(), @@ -259,7 +262,7 @@ std::shared_ptr ProcessGroupAgent::send( auto requestId = nextId(); auto future = std::make_shared(); - if (message.requiresResponse()) { + if (message.isRequest()) { { std::lock_guard lock{futureMutex_}; futures_[requestId] = future; @@ -271,14 +274,14 @@ std::shared_ptr ProcessGroupAgent::send( // NB: cannot directly pass ``to`` to the ``SendWork``, because it might no // longer be alive when the ``SendWork`` is executed. For example, the - // application could query the ``WorkerId`` using name through the - // ``RpcAgent::getWorkerId`` API, and pass the ``WorkerId`` back here, so we - // have C++ -> Python -> C++. For an asynchronous RPC, the ``WorkerId`` + // application could query the ``WorkerInfo`` using name through the + // ``RpcAgent::getWorkerInfo`` API, and pass the ``WorkerInfo`` back here, so + // we have C++ -> Python -> C++. For an asynchronous RPC, the ``WorkerInfo`` // reference on Python side could die before ``SendWork`` uses it, and Pybind // will not keep the Python reference alive even if it originally comes from - // the C++ land. 
Hence, we have to explicitly use the ``workerId`` in the C++ - // land. - enqueueSend(SendWork(workerIds_[to.id_], std::move(message))); + // the C++ land. Hence, we have to explicitly use the ``WorkerInfo`` in the + // C++ land. + enqueueSend(SendWork(allWorkerInfo_[to.id_], std::move(message))); return future; } @@ -340,18 +343,23 @@ void ProcessGroupAgent::enqueueRecv(RecvWork work) { (char*)payload.storage().data(), payload.numel())); Message message = deserialize(work.type_, ss); - if (message.requiresResponse()) { - auto response = cb_->operator()(message); - send(work.from_, std::move(response)); - } else if (message.isRequest()) { - cb_->operator()(message); + if (message.isRequest()) { + send(work.from_, cb_->operator()(message)); } else if (message.isResponse()) { auto id = message.id(); + std::shared_ptr fm = nullptr; + { + std::lock_guard lock{futureMutex_}; + fm = futures_[id]; + } + // Not holding lock on markCompleted as this could run callbacks that + // call agent_->send + fm->markCompleted(std::move(message)); { std::lock_guard lock{futureMutex_}; - futures_[id]->markCompleted(std::move(message)); futures_.erase(id); } + futureCV_.notify_all(); } else { // TODO: pass the error back to the caller instead of crashing here. TORCH_INTERNAL_ASSERT( @@ -378,7 +386,7 @@ void ProcessGroupAgent::listenLoop() { // FIXME: This LOG also prints warnings no InitGoogleLogging() was invoked // before logging, but it is not appropriate to call InitGoogleLogging() // here either. - LOG(INFO) << "Shutting down ProcessGroupAgent " << workerId_.name_ + LOG(INFO) << "Shutting down ProcessGroupAgent " << workerInfo_.name_ << std::endl; return; } @@ -386,7 +394,7 @@ void ProcessGroupAgent::listenLoop() { std::vector tensors = {torch::empty({size}, {torch::kChar})}; pg_->recv(tensors, srcRank, pg_->getRank())->wait(); - enqueueRecv(RecvWork(workerIds_[srcRank], type, std::move(tensors[0]))); + enqueueRecv(RecvWork(allWorkerInfo_[srcRank], type, std::move(tensors[0]))); } } diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h index bf192c8eb68da..a83cb5eaa8339 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.h +++ b/torch/csrc/distributed/rpc/process_group_agent.h @@ -15,20 +15,20 @@ namespace rpc { // SendWork and RecvWork will be put into a task queue, and later picked up by // worker threads from the same ThreadPool. struct SendWork { - SendWork(const WorkerId& to, Message&& message) + SendWork(const WorkerInfo& to, Message&& message) : to_(to), message_(message) {} - const WorkerId& to_; + const WorkerInfo& to_; Message message_; }; // SendWork wraps a Message and RecvWork wraps a Tensor. The difference here is // to allow us to run serialization/deserialization in the worker threads. 
struct RecvWork { - RecvWork(const WorkerId& from, MessageType type, torch::Tensor&& payload) + RecvWork(const WorkerInfo& from, MessageType type, torch::Tensor&& payload) : from_(from), type_(type), payload_(payload) {} - const WorkerId& from_; + const WorkerInfo& from_; const MessageType type_; torch::Tensor payload_; }; @@ -40,9 +40,9 @@ class ProcessGroupAgent : public RpcAgent { std::shared_ptr pg, int numSendRecvThreads = 4); - const WorkerId& getWorkerId(const std::string& workerName) const override; + const WorkerInfo& getWorkerInfo(const std::string& workerName) const override; - const WorkerId& getWorkerId(worker_id_t id) const override; + const WorkerInfo& getWorkerInfo(worker_id_t id) const override; void join() override; @@ -52,7 +52,7 @@ class ProcessGroupAgent : public RpcAgent { // This method wraps the destination information and the message into a // SendWork object, and put the SendWork into a queue. Another thread will // consume SendWork from the queue and send it out. - std::shared_ptr send(const WorkerId& to, Message&& message) + std::shared_ptr send(const WorkerInfo& to, Message&& message) override; private: @@ -99,13 +99,13 @@ class ProcessGroupAgent : public RpcAgent { bool hasPendingMessage(); int64_t nextId() { - return nextId_++; + return ++nextId_; } std::shared_ptr pg_; // worker name -> rank std::unordered_map nameMap_; - std::vector workerIds_; + std::vector allWorkerInfo_; // record the number of messages sent to and received from each peer. The recv // counter is only marked after the message is processed. Join uses allgather // to collect all counts from all peers, uses these counters to detect global @@ -129,7 +129,8 @@ class ProcessGroupAgent : public RpcAgent { // This is just a temporary solution for (2). ThreadPool threadPool_; std::unordered_map> futures_; - std::mutex futureMutex_; + mutable std::mutex futureMutex_; + mutable std::condition_variable futureCV_; }; } // namespace rpc diff --git a/torch/csrc/distributed/rpc/py_rref.cpp b/torch/csrc/distributed/rpc/py_rref.cpp new file mode 100644 index 0000000000000..54168bf73f81a --- /dev/null +++ b/torch/csrc/distributed/rpc/py_rref.cpp @@ -0,0 +1,135 @@ +#include + +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +namespace { + +// Constants below are used in PyRRef pickling and unpickling. PyRRef is +// converted into a py::tuple in pickling, and reconstructed from the py::tuple +// in unpickling. +constexpr int RFD_IDX = 0; // index of RRefForkData +constexpr int TYPE_IDX = 1; // index of type (py::object or IValue) + +// number of data fields in the py::tuple. +// NB: if more fields are added, make sure this field is also bumped +constexpr int RREF_TUPLE_SIZE = 2; + +} // namespace + +PyRRef::PyRRef(std::shared_ptr rref) : rref_(std::move(rref)) { + TORCH_CHECK(rref_, "PyRRef must not wrap nullptr"); +} + +bool PyRRef::isOwner() const { + return rref_->isOwner(); +} + +worker_id_t PyRRef::owner() const { + return rref_->owner(); +} + +py::object PyRRef::toHere() { + if (rref_->isOwner()) { + if (rref_->isPyObj()) { + const py::object& value = + std::static_pointer_cast>(rref_)->getValue(); + + { + // acquiring GIL as the return statement construct a new py::object from + // a const reference. + AutoGIL ag; + return value; + } + } else { + IValue value = + std::static_pointer_cast>(rref_)->getValue(); + + { + // acquiring GIL as torch::jit::toPyObject creates new py::object + // without grabbing the GIL. 
+ AutoGIL ag;
+ return torch::jit::toPyObject(std::move(value));
+ }
+ }
+ } else {
+ if (rref_->isPyObj()) {
+ // UserRRef::toHere() calls python_rpc_handler which acquires
+ // GIL.
+ return std::static_pointer_cast>(rref_)->toHere();
+ } else {
+ IValue value =
+ std::static_pointer_cast>(rref_)->toHere();
+
+ {
+ // acquiring GIL as torch::jit::toPyObject creates new py::object
+ // without grabbing the GIL.
+ AutoGIL ag;
+ return torch::jit::toPyObject(std::move(value));
+ }
+ }
+ }
+}
+
+py::object PyRRef::localValue() {
+ TORCH_CHECK(
+ rref_->isOwner(),
+ "Cannot call localValue() on a non-local reference. Call it on ",
+ RRefContext::getInstance()->getWorkerName());
+
+ if (rref_->isPyObj()) {
+ const py::object& value =
+ std::dynamic_pointer_cast>(rref_)->getValue();
+
+ {
+ // acquiring GIL as the return statement constructs a new py::object from
+ // a const reference.
+ AutoGIL ag;
+ return value;
+ }
+ } else {
+ auto value =
+ std::dynamic_pointer_cast>(rref_)->getValue();
+ {
+ // acquiring GIL as torch::jit::toPyObject creates new py::object without
+ // grabbing the GIL.
+ AutoGIL ag;
+ return torch::jit::toPyObject(std::move(value));
+ }
+ }
+}
+
+py::tuple PyRRef::pickle() const {
+ auto& ctx = RRefContext::getInstance();
+ // TODO: use a dispatch table to pickle/unpickle an RRef, and only
+ // install the dispatch table when there are indeed RPC activities. As
+ // a counterexample, checkpointing a model with RRefs should not trigger
+ // forks to be added as a fork or a child.
+ auto rfd = ctx->prepareChildFork(rref_);
+ return py::make_tuple(rfd.toPyTuple(), rref_->isPyObj());
+}
+
+PyRRef PyRRef::unpickle(const py::tuple& t) {
+ TORCH_INTERNAL_ASSERT(
+ t.size() == RREF_TUPLE_SIZE, "Pickled RRef must contain 2 fields.");
+ auto& ctx = RRefContext::getInstance();
+ auto rfd = RRefForkData::fromPyTuple(t[RFD_IDX].cast());
+ std::shared_ptr rref = nullptr;
+ bool isPyObj = t[TYPE_IDX].cast();
+ if (isPyObj) {
+ rref = ctx->getOrCreateRRef(rfd);
+ } else {
+ rref = ctx->getOrCreateRRef(rfd);
+ }
+
+ ctx->notifyOwnerAndParentOfFork(rfd.forkId_, rfd.parent_, rref);
+ return PyRRef(std::move(rref));
+}
+
+} // namespace rpc
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/rpc/py_rref.h b/torch/csrc/distributed/rpc/py_rref.h
new file mode 100644
index 0000000000000..10c5841490765
--- /dev/null
+++ b/torch/csrc/distributed/rpc/py_rref.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include
+#include
+#include
+
+namespace torch {
+namespace distributed {
+namespace rpc {
+
+// Python wrapper of an RRef shared_ptr that supports Python
+// pickle and unpickle.
+class PyRRef { + public: + explicit PyRRef(std::shared_ptr rref); + + bool isOwner() const; + worker_id_t owner() const; + py::object toHere(); + py::object localValue(); + py::tuple pickle() const; + static PyRRef unpickle(const py::tuple& t); + + private: + std::shared_ptr rref_; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/python_functions.cpp b/torch/csrc/distributed/rpc/python_functions.cpp index 1b9e4e62ef757..52cae25206d91 100644 --- a/torch/csrc/distributed/rpc/python_functions.cpp +++ b/torch/csrc/distributed/rpc/python_functions.cpp @@ -6,6 +6,18 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + namespace torch { namespace distributed { namespace rpc { @@ -47,8 +59,13 @@ std::shared_ptr matchBuiltinOp( ", kwargs: ", kwargs, ") to a builtin operator"); +} - // builtin operators. +void finishAcceptUserRRef(const Message& message) { + RRefContext::handleException(message); + auto rr = RemoteRet::fromMessage(message); + auto& ctx = RRefContext::getInstance(); + ctx->delPendingUser(rr->forkId()); } } // namespace @@ -71,6 +88,7 @@ py::object toPyObjInternal(RpcCommandBase& rpc, MessageType messageType) { case MessageType::PYTHON_RET: { // TODO: Try to avoid a copy here. auto& resp = static_cast(rpc); + return PythonRpcHandler::getInstance().loadPythonUDFResult( resp.pickledPayload(), resp.tensors()); } @@ -97,7 +115,7 @@ py::object toPyObj(const Message& message) { std::shared_ptr pyRpcBuiltin( RpcAgent& agent, - const WorkerId& dst, + const WorkerInfo& dst, const std::string& opName, const py::args& args, const py::kwargs& kwargs) { @@ -127,9 +145,9 @@ std::shared_ptr pyRpcBuiltin( } } -std::shared_ptr pyRemoteBuiltin( +PyRRef pyRemoteBuiltin( RpcAgent& agent, - const WorkerId& dst, + const WorkerInfo& dst, const std::string& opName, const py::args& args, const py::kwargs& kwargs) { @@ -137,22 +155,26 @@ std::shared_ptr pyRemoteBuiltin( auto op = matchBuiltinOp(opName, args, kwargs, stack); auto& ctx = RRefContext::getInstance(); - auto userRRef = ctx->createUserRRef(dst.id_); - agent.send( + // TODO: support creating RRefs on a local object. + TORCH_INTERNAL_ASSERT( + ctx->getWorkerId() != dst.id_, + "Does not support creating RRef on self yet."); + auto userRRef = ctx->createUserRRef(dst.id_); + auto fm = agent.send( dst, ScriptRemoteCall( - op, - std::move(stack), - userRRef->id().toIValue(), - userRRef->forkId().toIValue()) + op, std::move(stack), userRRef->rrefId(), userRRef->forkId()) .toMessage()); - return userRRef; + + ctx->addPendingUser(userRRef->forkId(), userRRef); + fm->addCallback(finishAcceptUserRRef); + return PyRRef(userRRef); } std::shared_ptr pyRpcPythonUdf( RpcAgent& agent, - const WorkerId& dst, - const std::string& pickledPythonUDF, + const WorkerInfo& dst, + std::string& pickledPythonUDF, std::vector& tensors) { return agent.send( dst, @@ -162,6 +184,30 @@ std::shared_ptr pyRpcPythonUdf( .toMessage()); } +PyRRef pyRemotePythonUdf( + RpcAgent& agent, + const WorkerInfo& dst, + std::string& pickledPythonUDF, + std::vector& tensors) { + auto& ctx = RRefContext::getInstance(); + // TODO: support creating RRefs on a local object. 
+ TORCH_INTERNAL_ASSERT(
+ ctx->getWorkerId() != dst.id_,
+ "Does not support creating RRef on self yet.");
+ auto userRRef = ctx->createUserRRef(dst.id_);
+ auto fm = agent.send(
+ dst,
+ PythonRemoteCall(
+ SerializedPyObj(std::move(pickledPythonUDF), std::move(tensors)),
+ userRRef->rrefId().toIValue(),
+ userRRef->forkId().toIValue())
+ .toMessage());
+
+ ctx->addPendingUser(userRRef->forkId(), userRRef);
+ fm->addCallback(finishAcceptUserRRef);
+ return PyRRef(userRRef);
+}
+
} // namespace rpc
} // namespace distributed
} // namespace torch
diff --git a/torch/csrc/distributed/rpc/python_functions.h b/torch/csrc/distributed/rpc/python_functions.h
index f3a1ffda07bde..c8f55e35d2cb2 100644
--- a/torch/csrc/distributed/rpc/python_functions.h
+++ b/torch/csrc/distributed/rpc/python_functions.h
@@ -1,15 +1,8 @@
#pragma once
#include
-#include
-#include
+#include
#include
-#include
-#include
-#include
-#include
-#include
-#include
#include
namespace torch {
@@ -20,24 +13,30 @@
py::object toPyObj(const Message& message);
std::shared_ptr pyRpcBuiltin(
RpcAgent& agent,
- const WorkerId& dst,
+ const WorkerInfo& dst,
const std::string& opName,
const py::args& args,
const py::kwargs& kwargs);
std::shared_ptr pyRpcPythonUdf(
RpcAgent& agent,
- const WorkerId& dst,
- const std::string& pickledPythonUDF,
+ const WorkerInfo& dst,
+ std::string& pickledPythonUDF,
std::vector& tensors);
-std::shared_ptr pyRemoteBuiltin(
+PyRRef pyRemoteBuiltin(
RpcAgent& agent,
- const WorkerId& dst,
+ const WorkerInfo& dst,
const std::string& opName,
const py::args& args,
const py::kwargs& kwargs);
+PyRRef pyRemotePythonUdf(
+ RpcAgent& agent,
+ const WorkerInfo& dst,
+ std::string& pickledPythonUDF,
+ std::vector& tensors);
+
} // namespace rpc
} // namespace distributed
} // namespace torch
diff --git a/torch/csrc/distributed/rpc/python_remote_call.cpp b/torch/csrc/distributed/rpc/python_remote_call.cpp
new file mode 100644
index 0000000000000..cdd307c3c4e08
--- /dev/null
+++ b/torch/csrc/distributed/rpc/python_remote_call.cpp
@@ -0,0 +1,53 @@
+#include
+#include
+
+namespace torch {
+namespace distributed {
+namespace rpc {
+
+PythonRemoteCall::PythonRemoteCall(
+ SerializedPyObj&& serializedPyObj,
+ at::IValue retRRefId,
+ at::IValue retForkId)
+ : serializedPyObj_(std::move(serializedPyObj)),
+ retRRefId_(std::move(retRRefId)),
+ retForkId_(std::move(retForkId)) {}
+
+Message PythonRemoteCall::toMessage() && {
+ std::vector ivalues = serializedPyObj_.toIValues();
+ ivalues.emplace_back(retRRefId_);
+ ivalues.emplace_back(retForkId_);
+
+ std::vector tensor_table;
+ auto payload =
+ jit::pickle(c10::ivalue::Tuple::create(ivalues), &tensor_table);
+
+ return Message(
+ std::move(payload),
+ std::move(tensor_table),
+ MessageType::PYTHON_REMOTE_CALL);
+}
+
+std::unique_ptr PythonRemoteCall::fromMessage(
+ const Message& message) {
+ auto payload = static_cast(message.payload().data());
+ auto payload_size = message.payload().size();
+
+ auto value =
+ jit::unpickle(payload, payload_size, nullptr, &message.tensors());
+ auto values = value.toTuple()->elements();
+
+ // pop the last two elements off values: first the fork id, then the rref id
+ auto retForkId = std::move(values.back());
+ values.pop_back();
+ auto retRRefId = std::move(values.back());
+ values.pop_back();
+ auto serializedPyObj = SerializedPyObj::fromIValues(std::move(values));
+
+ return c10::guts::make_unique(
+ std::move(serializedPyObj), std::move(retRRefId), std::move(retForkId));
+}
+
+} // namespace rpc
+} // namespace distributed
+} // namespace
torch diff --git a/torch/csrc/distributed/rpc/python_remote_call.h b/torch/csrc/distributed/rpc/python_remote_call.h new file mode 100644 index 0000000000000..fbf255e20a762 --- /dev/null +++ b/torch/csrc/distributed/rpc/python_remote_call.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace torch { +namespace distributed { +namespace rpc { + +class TORCH_API PythonRemoteCall : public RpcCommandBase { + public: + PythonRemoteCall( + SerializedPyObj&& serializedPyObj, + at::IValue retRRefId, + at::IValue retForkId); + + inline const SerializedPyObj& serializedPyObj() const { + return serializedPyObj_; + } + + inline const at::IValue& retRRefId() const { + return retRRefId_; + } + + inline const at::IValue& retForkId() const { + return retForkId_; + } + + Message toMessage() && override; + static std::unique_ptr fromMessage(const Message& message); + + private: + const SerializedPyObj serializedPyObj_; + const at::IValue retRRefId_; + const at::IValue retForkId_; +}; + +} // namespace rpc +} // namespace distributed +} // namespace torch diff --git a/torch/csrc/distributed/rpc/python_rpc_handler.cpp b/torch/csrc/distributed/rpc/python_rpc_handler.cpp index b9087e52119f6..61775c24eeac2 100644 --- a/torch/csrc/distributed/rpc/python_rpc_handler.cpp +++ b/torch/csrc/distributed/rpc/python_rpc_handler.cpp @@ -10,6 +10,7 @@ PythonRpcHandler::PythonRpcHandler() { py::module::import("torch.distributed.internal_rpc_utils"); runUDFFunction_ = module.attr("run_python_udf_internal"); loadResultFunction_ = module.attr("load_python_udf_result_internal"); + serializeFunction_ = module.attr("serialize"); } PythonRpcHandler& PythonRpcHandler::getInstance() { @@ -24,7 +25,8 @@ std::vector PythonRpcHandler::generatePythonUDFResult( AutoGIL ag; auto pargs = py::bytes(pickledPayload.data(), pickledPayload.size()); TORCH_CHECK(runUDFFunction_ != nullptr, "runUDFFunction_ is nullptr"); - py::tuple pres = runUDFFunction_(pargs, requestTensorTable); + py::tuple pres = + serializeFunction_(runUDFFunction_(pargs, requestTensorTable)); const auto& presStr = pres[0].cast(); responseTensorTable = pres[1].cast>(); std::vector payload(presStr.begin(), presStr.end()); @@ -40,6 +42,26 @@ py::object PythonRpcHandler::loadPythonUDFResult( return loadResultFunction_(pargs, tensorTable); } +py::object PythonRpcHandler::runPythonUDF( + const SerializedPyObj& serializedObj) { + AutoGIL ag; + return runUDFFunction_( + py::bytes(serializedObj.payload_), serializedObj.tensors_); +} + +SerializedPyObj PythonRpcHandler::serialize(const py::object& obj) { + AutoGIL ag; + py::tuple t = serializeFunction_(obj); + return SerializedPyObj( + t[0].cast(), t[1].cast>()); +} + +py::object PythonRpcHandler::deserialize(const SerializedPyObj& serializedObj) { + AutoGIL ag; + return loadResultFunction_( + py::bytes(serializedObj.payload_), serializedObj.tensors_); +} + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/python_rpc_handler.h b/torch/csrc/distributed/rpc/python_rpc_handler.h index 2fd51b8942dc2..ce551153bc102 100644 --- a/torch/csrc/distributed/rpc/python_rpc_handler.h +++ b/torch/csrc/distributed/rpc/python_rpc_handler.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace torch { @@ -26,6 +27,12 @@ class PYBIND11_EXPORT PythonRpcHandler { py::object loadPythonUDFResult( const std::vector& pickledPayload, const std::vector& tensorTable); + // Run a pickled Python UDF and return the result py::object + py::object 
runPythonUDF(const SerializedPyObj& serializedObj);
+ // Serialize a py::object into a string
+ SerializedPyObj serialize(const py::object& obj);
+ // Deserialize a string into a py::object
+ py::object deserialize(const SerializedPyObj& serializedObj);
private:
PythonRpcHandler();
@@ -38,6 +45,7 @@ class PYBIND11_EXPORT PythonRpcHandler {
py::object runUDFFunction_;
py::object loadResultFunction_;
+ py::object serializeFunction_;
};
} // namespace rpc
diff --git a/torch/csrc/distributed/rpc/request_callback_impl.cpp b/torch/csrc/distributed/rpc/request_callback_impl.cpp
index d9eeabec64c87..c2221759f3e73 100644
--- a/torch/csrc/distributed/rpc/request_callback_impl.cpp
+++ b/torch/csrc/distributed/rpc/request_callback_impl.cpp
@@ -4,16 +4,17 @@
#include
#include
#include
+#include
#include
#include
#include
#include
#include
#include
+#include
#include
#include
#include
-#include
#include
namespace torch {
@@ -56,15 +57,11 @@ std::unique_ptr RequestCallbackImpl::processRpc(
return c10::guts::make_unique(
std::move(payload), std::move(responseTensorTable));
}
- case MessageType::REMOTE_CALL: {
+ case MessageType::SCRIPT_REMOTE_CALL: {
auto& src = static_cast(rpc);
-
- auto rrefId = RRefId::fromIValue(src.retRRefId());
- auto forkId = ForkId::fromIValue(src.retForkId());
- TORCH_CHECK(rrefId != forkId, "Does not support remote call to self.");
-
auto& ctx = RRefContext::getInstance();
- auto ownerRRef = ctx->getOrCreateOwnerRRef(rrefId);
+
+ auto ownerRRef = ctx->getOrCreateOwnerRRef(src.retRRefId());
// TODO: make this asynchronous
// src is only alive within this block, use reference to avoid copy
@@ -78,25 +75,60 @@ std::unique_ptr RequestCallbackImpl::processRpc(
stack.size());
ownerRRef->setValue(std::move(stack.front()));
- return nullptr;
+ ctx->addForkOfOwner(src.retRRefId(), src.retForkId());
+ return c10::guts::make_unique(
+ src.retRRefId(), src.retForkId());
+ }
+ case MessageType::PYTHON_REMOTE_CALL: {
+ auto& prc = static_cast(rpc);
+
+ auto rrefId = RRefId::fromIValue(prc.retRRefId());
+ auto forkId = ForkId::fromIValue(prc.retForkId());
+ auto& ctx = RRefContext::getInstance();
+
+ auto ownerRRef = ctx->getOrCreateOwnerRRef(rrefId);
+ ownerRRef->setValue(
+ PythonRpcHandler::getInstance().runPythonUDF(prc.serializedPyObj()));
+ ctx->addForkOfOwner(rrefId, forkId);
+ return c10::guts::make_unique(rrefId, forkId);
}
- case MessageType::RREF_FETCH_CALL: {
+ case MessageType::SCRIPT_RREF_FETCH_CALL: {
auto& srf = static_cast(rpc);
+ auto& ctx = RRefContext::getInstance();
// TODO: make this asynchronous
std::shared_ptr> rref =
- RRefContext::getInstance()->getOrCreateOwnerRRef(
- RRefId::fromIValue(srf.value()));
- return c10::guts::make_unique(rref->getValue());
+ ctx->getOrCreateOwnerRRef(srf.rrefId());
+ return c10::guts::make_unique(
+ RRefFetchRet({rref->getValue()}));
}
- case MessageType::RREF_USER_CREATE: {
- auto& sra = static_cast(rpc);
- RRefContext::getInstance()->addFork(sra.valueRef());
- return nullptr;
+ case MessageType::PYTHON_RREF_FETCH_CALL: {
+ auto& prf = static_cast(rpc);
+ auto& ctx = RRefContext::getInstance();
+ // TODO: make this asynchronous
+ std::shared_ptr> rref =
+ ctx->getOrCreateOwnerRRef(prf.rrefId());
+ SerializedPyObj result =
+ PythonRpcHandler::getInstance().serialize(rref->getValue());
+ return c10::guts::make_unique(
+ RRefFetchRet(result.toIValues()));
}
case MessageType::RREF_USER_DELETE: {
- auto& srd = static_cast(rpc);
- RRefContext::getInstance()->delFork(srd.valueRef());
- return nullptr;
+ auto& rud = static_cast(rpc);
+ auto&
ctx = RRefContext::getInstance(); + ctx->delForkOfOwner(rud.rrefId(), rud.forkId()); + return c10::guts::make_unique(); + } + case MessageType::RREF_CHILD_ACCEPT: { + auto& rca = static_cast(rpc); + auto& ctx = RRefContext::getInstance(); + ctx->delPendingChild(rca.forkId()); + return c10::guts::make_unique(); + } + case MessageType::RREF_FORK_REQUEST: { + auto& rfr = static_cast(rpc); + auto& ctx = RRefContext::getInstance(); + ctx->addForkOfOwner(rfr.rrefId(), rfr.forkId()); + return c10::guts::make_unique(); } case MessageType::MESSAGE_WITH_AUTOGRAD_REQ: { auto& rpcWithAutograd = static_cast(rpc); diff --git a/torch/csrc/distributed/rpc/rpc_agent.cpp b/torch/csrc/distributed/rpc/rpc_agent.cpp index 4fef3fd9a315f..fb5767813fb47 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.cpp +++ b/torch/csrc/distributed/rpc/rpc_agent.cpp @@ -4,15 +4,15 @@ namespace torch { namespace distributed { namespace rpc { -constexpr size_t WorkerId::MAX_NAME_LEN; +constexpr size_t WorkerInfo::MAX_NAME_LEN; -RpcAgent::RpcAgent(WorkerId workerId, std::unique_ptr cb) - : workerId_(std::move(workerId)), cb_(std::move(cb)) {} +RpcAgent::RpcAgent(WorkerInfo workerId, std::unique_ptr cb) + : workerInfo_(std::move(workerId)), cb_(std::move(cb)) {} RpcAgent::~RpcAgent() = default; -const WorkerId& RpcAgent::getWorkerId() const { - return workerId_; +const WorkerInfo& RpcAgent::getWorkerInfo() const { + return workerInfo_; } } // namespace rpc diff --git a/torch/csrc/distributed/rpc/rpc_agent.h b/torch/csrc/distributed/rpc/rpc_agent.h index f539a526f1534..35eb2b53402fe 100644 --- a/torch/csrc/distributed/rpc/rpc_agent.h +++ b/torch/csrc/distributed/rpc/rpc_agent.h @@ -12,9 +12,9 @@ namespace distributed { namespace rpc { // A globally unique ID to identify an RpcAgent -struct WorkerId { - WorkerId(std::string name, int id) - : WorkerId(std::move(name), (worker_id_t)id) { +struct WorkerInfo { + WorkerInfo(std::string name, int id) + : WorkerInfo(std::move(name), (worker_id_t)id) { TORCH_CHECK( id <= std::numeric_limits::max(), "RPC worker id ", @@ -22,7 +22,8 @@ struct WorkerId { " out of bound of int16_t."); } - WorkerId(std::string name, worker_id_t id) : name_(std::move(name)), id_(id) { + WorkerInfo(std::string name, worker_id_t id) + : name_(std::move(name)), id_(id) { bool validSize = name_.length() < MAX_NAME_LEN && name_.length() > 0; bool validChar = std::find_if(name_.begin(), name_.end(), [](char c) { @@ -51,9 +52,9 @@ struct WorkerId { // construction. class RpcAgent { public: - // `WorkerId` is the globally unique identifier for this RpcAgent instance. It - // contains a ``name_`` field and an ``id_`` field. ``name_`` is the globally - // unique name for this ``RpcAgent``. It is up to the ``RpcAgent`` + // `WorkerInfo` is the globally unique identifier for this RpcAgent instance. + // It contains a ``name_`` field and an ``id_`` field. ``name_`` is the + // globally unique name for this ``RpcAgent``. It is up to the ``RpcAgent`` // implementation to determine how to resolve names. ``id_`` is the globally // unique ID for this ``RpcAgent``. This should be determined by the // ``RpcAgent`` implementation. @@ -61,7 +62,7 @@ class RpcAgent { // ``RpcAgent`` base class makes no assumption on the thread-safeness of the // ``RequestCallback``. ``RpcAgent`` implementations need to make sure that // its threading model conform to ``RequestCallback``'s requirement. 
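To make that contract concrete, this is roughly the surface a custom agent has to supply after the rename (a hedged skeleton, not a real backend; only the constructor and the virtual methods declared in this header are shown, with template arguments restored for readability, and InProcessAgent is an illustrative name):

    class InProcessAgent : public RpcAgent {
     public:
      InProcessAgent(WorkerInfo info, std::unique_ptr<RequestCallback> cb)
          : RpcAgent(std::move(info), std::move(cb)) {}

      std::shared_ptr<FutureMessage> send(
          const WorkerInfo& to, Message&& message) override;
      const WorkerInfo& getWorkerInfo(
          const std::string& workerName) const override;
      const WorkerInfo& getWorkerInfo(worker_id_t id) const override;
      void join() override;
      void sync() override;
    };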
- RpcAgent(WorkerId id, std::unique_ptr cb); + RpcAgent(WorkerInfo id, std::unique_ptr cb); virtual ~RpcAgent(); @@ -73,19 +74,20 @@ class RpcAgent { // when the response arrives. For other message types, the Future should be // ignored by the caller. virtual std::shared_ptr send( - const WorkerId& to, + const WorkerInfo& to, Message&& message) = 0; - // Return a reference to the ``WorkerId`` of this RpcAgent. + // Return a reference to the ``WorkerInfo`` of this RpcAgent. // NB: not using ``c10::optional`` here because we might // need to create a separate RPC API lib and avoid forcing all ``RpcAgent`` // implementations to depend on libtorch. - const WorkerId& getWorkerId() const; + const WorkerInfo& getWorkerInfo() const; - // Return a reference to the ``WorkerId`` of the given ``workerName``. - virtual const WorkerId& getWorkerId(const std::string& workerName) const = 0; + // Return a reference to the ``WorkerInfo`` of the given ``workerName``. + virtual const WorkerInfo& getWorkerInfo( + const std::string& workerName) const = 0; - virtual const WorkerId& getWorkerId(worker_id_t id) const = 0; + virtual const WorkerInfo& getWorkerInfo(worker_id_t id) const = 0; // Call sync and join all internal threads. This method should be called // before every RPC process exits. @@ -96,7 +98,7 @@ class RpcAgent { virtual void sync() = 0; protected: - const WorkerId workerId_; + const WorkerInfo workerInfo_; const std::string workerName_; const std::unique_ptr cb_; }; diff --git a/torch/csrc/distributed/rpc/rref.cpp b/torch/csrc/distributed/rpc/rref.cpp index e81281b3a7fa2..fc34f1b2d7974 100644 --- a/torch/csrc/distributed/rpc/rref.cpp +++ b/torch/csrc/distributed/rpc/rref.cpp @@ -1,11 +1,27 @@ #include + +#include #include -#include +#include namespace torch { namespace distributed { namespace rpc { +namespace { + +constexpr int OWNER_IDX = 0; // index of ownerId in the tuple +constexpr int RREFID_ON_IDX = 1; // index of RRefId.createdOn_ in the tuple +constexpr int RREFID_ID_IDX = 2; // index of RRefId.localId_ in the tuple +constexpr int FORKID_ON_IDX = 3; // index of ForkId.createdOn_ in the tuple +constexpr int FORKID_ID_IDX = 4; // index of ForkId.localId_ in the tuple +constexpr int PARENT_IDX = 5; // index of parent in the tuple + +// NB: if more fields are added, make sure this field is also bumped +constexpr int RFD_TUPLE_SIZE = 6; // number of RRefForkData fields in py::tuple + +} // namespace + std::atomic RRefContext::nextLocalId_{0}; ////////////////////////// RRefForkData ///////////////////////////////// @@ -13,27 +29,47 @@ std::atomic RRefContext::nextLocalId_{0}; RRefForkData::RRefForkData( worker_id_t ownerId, const RRefId& rrefId, - const ForkId& forkId) - : ownerId_(ownerId), rrefId_(rrefId), forkId_(forkId) {} - -at::IValue RRefForkData::toIValue() const { - std::vector ivalues = { - (int64_t)ownerId_, rrefId_.toIValue(), forkId_.toIValue()}; + const ForkId& forkId, + worker_id_t parent) + : ownerId_(ownerId), rrefId_(rrefId), forkId_(forkId), parent_(parent) {} + +py::tuple RRefForkData::toPyTuple() const { + return py::make_tuple( + ownerId_, + rrefId_.createdOn_, + rrefId_.localId_, + forkId_.createdOn_, + forkId_.localId_, + parent_); +} - return c10::ivalue::Tuple::create(std::move(ivalues)); +RRefForkData RRefForkData::fromPyTuple(const py::tuple& t) { + TORCH_INTERNAL_ASSERT( + t.size() == RFD_TUPLE_SIZE, + "Pickled RRefForkData must contain 6 numbers."); + worker_id_t ownerId = t[OWNER_IDX].cast(); + // const reference will extend the lifetime of the temporary variable 
+ const RRefId& rrefId = RRefId(
+ t[RREFID_ON_IDX].cast(),
+ t[RREFID_ID_IDX].cast());
+ const RRefId& forkId = RRefId(
+ t[FORKID_ON_IDX].cast(),
+ t[FORKID_ID_IDX].cast());
+ worker_id_t parent = t[PARENT_IDX].cast();
+ return RRefForkData(ownerId, rrefId, forkId, parent);
}
RRefForkData RRefForkData::fromIValue(const at::IValue& ivalue) {
auto ivalues = ivalue.toTuple()->elements();
- TORCH_CHECK(
- ivalues.size() == 3,
+ TORCH_INTERNAL_ASSERT(
+ ivalues.size() == 4,
"Constructing RRefForkData from ivalue "
- "expects a GenericList of 3 elements, but got ",
+ "expects a GenericList of 4 elements, but got ",
ivalues.size());
int64_t ownerId = ivalues[0].toInt();
- TORCH_CHECK(
+ TORCH_INTERNAL_ASSERT(
ownerId < std::numeric_limits::max(),
"RRefId createdOn out of range, got ",
ownerId);
@@ -41,7 +77,12 @@ RRefForkData RRefForkData::fromIValue(const at::IValue& ivalue) {
RRefId rrefId = RRefId::fromIValue(ivalues[1]);
ForkId forkId = ForkId::fromIValue(ivalues[2]);
- return RRefForkData(ownerId, rrefId, forkId);
+ int64_t parent = ivalues[3].toInt();
+ TORCH_INTERNAL_ASSERT(
+ parent < std::numeric_limits::max(),
+ "RRefForkData parent out of range, got ",
+ parent);
+ return RRefForkData(ownerId, rrefId, forkId, parent);
}
////////////////////////////// RRef /////////////////////////////////////
@@ -49,71 +90,100 @@ RRefForkData RRefForkData::fromIValue(const at::IValue& ivalue) {
RRef::RRef(worker_id_t ownerId, const RRefId& rrefId)
: ownerId_(ownerId), rrefId_(rrefId) {}
-worker_id_t RRef::owner() const {
- return ownerId_;
-}
-
-const RRefId& RRef::id() const {
- return rrefId_;
-}
-
-at::IValue RRef::fork() const {
+RRefForkData RRef::fork() const {
+ auto& ctx = RRefContext::getInstance();
return RRefForkData(
- ownerId_, rrefId_, RRefContext::getInstance()->genRRefId())
- .toIValue();
- // NB: does not support sharing RRefs between users
- // TODO: notify the owner
+ ownerId_, rrefId_, ctx->genGloballyUniqueId(), ctx->getWorkerId());
}
////////////////////////// UserRRef /////////////////////////////////////
-UserRRef::UserRRef(
+template
+UserRRef::UserRRef(
worker_id_t ownerId,
const RRefId& rrefId,
const ForkId& forkId)
: RRef(ownerId, rrefId), forkId_(forkId) {
- AT_ASSERT(
- !(forkId_ == rrefId_),
- "User RRef's fork ID should not be the same as its rref Id");
- if (RRefContext::getInstance()->getWorkerId() == rrefId_.createdOn_) {
- // creator user, notify owner.
- auto& agent = RRefContext::getInstance()->agent();
- agent->send(
- agent->getWorkerId(ownerId_),
- ScriptRRefCreate(RRefForkData(ownerId_, rrefId_, forkId_).toIValue())
- .toMessage());
- } else {
- AT_ERROR("Does not support sharing RRefs between users yet");
- }
+ // Do nothing:
+ // (1) If this UserRRef is a fork of an existing RRef, RRefContext will send
+ // a RREF_FORK_REQUEST message to the owner.
+ // (2) If this is the creator UserRRef, ScriptRemoteCall or PythonRemoteCall
+ // will properly notify the owner.
}
-UserRRef::~UserRRef() {
+template
+UserRRef::~UserRRef() {
+ // TODO: queue this in RRefContext instead of doing it here.
auto& ctx = RRefContext::getInstance(); if (ctx->getWorkerId() != ownerId_) { - ctx->agent()->send( - ctx->agent()->getWorkerId(ownerId_), - ScriptRRefDelete(RRefForkData(ownerId_, rrefId_, forkId_).toIValue()) - .toMessage()); + auto fm = ctx->agent()->send( + ctx->agent()->getWorkerInfo(ownerId_), + RRefUserDelete(rrefId_, forkId_).toMessage()); + + fm->addCallback( + [](const Message& message) { RRefContext::handleException(message); }); } } -const ForkId& UserRRef::forkId() const { +template +const ForkId& UserRRef::forkId() const { return forkId_; } -bool UserRRef::isOwner() const { - return false; +template <> +IValue UserRRef::toHere() { + auto& agent = RRefContext::getInstance()->agent(); + std::shared_ptr fm = agent->send( + agent->getWorkerInfo(ownerId_), + ScriptRRefFetchCall(rrefId()).toMessage()); + const Message& message = fm->wait(); + RRefContext::handleException(message); + auto rfr = RRefFetchRet::fromMessage(message); + TORCH_INTERNAL_ASSERT( + rfr->values().size() == 1, + "RRef of IValue should contain a single IValue, but got ", + rfr->values().size()); + return rfr->values().front(); } -IValue UserRRef::toHere() { +template <> +py::object UserRRef::toHere() { auto& agent = RRefContext::getInstance()->agent(); std::shared_ptr fm = agent->send( - agent->getWorkerId(ownerId_), - ScriptRRefFetchCall(id().toIValue()).toMessage()); - auto srv = ScriptRRefFetchRet::fromMessage(fm->wait()); - return srv.value(); + agent->getWorkerInfo(ownerId_), + PythonRRefFetchCall(rrefId()).toMessage()); + const Message& message = fm->wait(); + RRefContext::handleException(message); + auto rfr = RRefFetchRet::fromMessage(message); + return PythonRpcHandler::getInstance().deserialize( + SerializedPyObj::fromIValues(rfr->values())); } +template class UserRRef; +template class UserRRef; + +////////////////////////// OwnerRRef ///////////////////////////////////// + +template +const T& OwnerRRef::getValue() const { + // TODO: use callback to make this non-blocking + std::unique_lock lock(mutex_); + valueCV_.wait(lock, [this] { return value_.has_value(); }); + return value_.value(); +} + +template +void OwnerRRef::setValue(T&& value) { + { + std::lock_guard lock(mutex_); + value_ = std::move(value); + } + valueCV_.notify_all(); +} + +template class OwnerRRef; +template class OwnerRRef; + } // namespace rpc } // namespace distributed } // namespace torch diff --git a/torch/csrc/distributed/rpc/rref.h b/torch/csrc/distributed/rpc/rref.h index 03fa11a25c629..53d557a49deed 100644 --- a/torch/csrc/distributed/rpc/rref.h +++ b/torch/csrc/distributed/rpc/rref.h @@ -4,6 +4,7 @@ #include #include #include +#include #include @@ -13,71 +14,245 @@ namespace rpc { class RRef; class RRefContext; +template class UserRRef; // Represents fork of an RRef to be sent over the wire. -// -// In order to preserve correctness of reference counting, each RRefForkData -// **MUST** be deserialized into a RRef. This means that if RRefForkData is to -// be transferred across the network, we need the guarantee that the message -// will *eventually* get to the peer, and that the peer will create a RRef out -// of it. Therefore, no constructor of RRefForkData is exposed, and -// applications should never directly use RRefForkData. All construction are -// done within ``RRef`` and ``RRefContext``. 
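The OwnerRRef::getValue()/setValue() pair implemented above is a one-shot synchronization cell: the owner publishes the value once, and readers block until it exists. For reference, a standalone sketch of the same pattern (illustration only; ReadyCell is a hypothetical name, and it uses std::optional where the code above uses c10::optional):

    #include <condition_variable>
    #include <mutex>
    #include <optional>

    template <typename T>
    class ReadyCell {
     public:
      // Block until a value has been published, then return a reference to it.
      const T& get() const {
        std::unique_lock<std::mutex> lock(mutex_);
        cv_.wait(lock, [this] { return value_.has_value(); });
        return *value_;
      }
      // Publish the value and wake all blocked readers; call at most once.
      void set(T&& v) {
        {
          std::lock_guard<std::mutex> lock(mutex_);
          value_ = std::move(v);
        }
        cv_.notify_all();
      }

     private:
      mutable std::mutex mutex_;
      mutable std::condition_variable cv_;
      std::optional<T> value_;
    };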
struct RRefForkData {
- at::IValue toIValue() const;
+ py::tuple toPyTuple() const;
+ static RRefForkData fromPyTuple(const py::tuple& obj);
+
+ const worker_id_t ownerId_;
+ const RRefId rrefId_;
+ const ForkId forkId_;
+ const worker_id_t parent_;
private:
friend class RRef;
friend class RRefContext;
+ template
friend class UserRRef;
RRefForkData(
worker_id_t ownerId,
const RRefId& rrefId_,
- const ForkId& forkId_);
+ const ForkId& forkId_,
+ worker_id_t parent);
static RRefForkData fromIValue(const at::IValue&);
-
- const worker_id_t ownerId_;
- const RRefId rrefId_;
- const ForkId forkId_;
};
static_assert(
C10_IS_TRIVIALLY_COPYABLE(RRefForkData),
"RRefForkData must be trivially copyable");
+// Note [RRef Protocol]
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~
+//
+// [Background]
+//
+// RRef stands for Remote REFerence. Each RRef is owned by a single worker
+// (i.e., owner) and can be used by multiple users. The owner stores the real
+// data referenced by its RRefs. RRef needs to support fast and scalable RPC.
+// Hence, in the design, we avoid using a single global master to keep RRef
+// states; instead, owners will keep track of the global reference counts
+// for their RRefs. Every RRef can be uniquely identified by a global RRefId,
+// which is assigned at the time it is first created either on a user or on the
+// owner.
+//
+// On the owner worker, there is only one OwnerRRef instance, which contains the
+// real data, while on user workers, there can be as many UserRRefs as
+// necessary, and UserRRef does not hold the data. All usage on the OwnerRRef
+// should retrieve the unique OwnerRRef instance using the globally unique
+// RRefId. A UserRRef will be created when it is used as an argument or return
+// value in a dist.rpc or dist.remote call, but RRef forking and reference
+// counting (RC) are completely transparent to applications. Every UserRRef will
+// also have its globally unique ForkId.
+//
+// [Assumptions]
+//
+// 1. Transient Network Failures
+//
+// TODO: current RRef implementation does not tolerate failures
+//
+// The RRef design aims to handle transient network failures by retrying
+// messages. Node crashes and permanent network partitions are beyond the
+// scope. When those incidents occur, the application may take down all
+// workers, revert to the previous checkpoint, and resume training.
+//
+// 2. Non-idempotent UDFs
+//
+// We assume UDFs are not idempotent and therefore cannot be retried. However,
+// internal RRef control messages will be made idempotent and retryable.
+//
+// TODO: RRef internal messages are not yet idempotent
+//
+// 3. Out of Order Message Delivery
+//
+// We do not assume message delivery order between any pair of nodes, because
+// both sender and receiver are using multiple threads. There is no guarantee on
+// which message will be processed first.
+//
+// [RRef Lifetime]
+//
+// The goal of the protocol is to delete an OwnerRRef at an appropriate time.
+// The right time to delete an OwnerRRef is when there are no living UserRRefs
+// and Python GC also agrees to delete the OwnerRRef instance on the owner. The
+// tricky part is to determine if there are any living UserRRefs.
+//
+// A user can get a UserRRef in three situations:
+//
+// (1). Receiving a UserRRef from the owner.
+// (2). Receiving a UserRRef from another user.
+// (3). Creating a new UserRRef owned by another worker.
+//
+// (1) is the simplest case where the owner initiates the fork, and hence it can
+// easily increment local RC.
The only requirement is that any UserRRef must
+// notify the owner before destruction. Hence, we need the first guarantee:
+//
+// G1. The owner will be notified when any UserRRef is deleted.
+//
+// As messages might come delayed or out-of-order, we need one more guarantee
+// to make sure the delete message is not sent out too soon. Let us first
+// introduce a new concept. If A sends an RPC to B that involves an RRef, we
+// call the RRef on A the parent RRef and the RRef on B the child RRef.
+//
+// G2. Parent RRef cannot be deleted until the child RRef is confirmed by the
+// owner.
+//
+// Under (1), where the caller is UserRRef and callee is OwnerRRef, it simply
+// means that the user will not send out the delete message until all previous
+// messages are ACKed. Note that ACKed does not mean the owner finishes
+// executing the function; instead, it only means the owner has retrieved its
+// local OwnerRRef and is about to pass it to the function, which is sufficient
+// to keep the OwnerRRef alive even if the delete message from the user arrives
+// at the owner before the function finishes execution.
+//
+// With (2) and (3), it is possible that the owner only partially knows the
+// RRef fork graph, or may not know it at all. For example, the RRef could be
+// constructed on a user, and before the owner receives the RPC call, the
+// creator user might have already shared the RRef with other users, and those
+// users could further share the RRef. One invariant is that the fork graph of
+// any RRef is always a tree rooted at the owner, because forking an RRef
+// always creates a new RRef instance, and hence every RRef has a single
+// parent. One nasty detail is that when an RRef is created on a user,
+// technically the owner is not its parent but we still consider it that way
+// and it does not break the argument below.
+//
+// The owner's view on any node (fork) in the tree has three stages:
+//
+// 1) unknown -> 2) known -> 3) deleted.
+//
+// The owner's view on the entire tree keeps changing. The owner deletes its
+// OwnerRRef instance when it thinks there are no living UserRRefs, i.e., when
+// OwnerRRef is deleted, all UserRRefs could be either indeed deleted or
+// unknown. The dangerous case is when some forks are unknown and others are
+// deleted.
+//
+// G2 trivially guarantees that no parent UserRRef Y can be deleted before the
+// owner knows all of Y's children UserRRefs.
+//
+// However, it is possible that the child UserRRef Z may be deleted before the
+// owner knows its parent Y. More specifically, this can happen when all of Z's
+// messages are processed by the owner before all messages from Y, including
+// the delete message. Nevertheless, this does not cause any problem, because
+// at least one of Y's ancestors will be alive, and it will prevent the owner
+// from deleting the OwnerRRef. Consider the following example: (NB: this
+// scenario will no longer be relevant when we block UDFs until all RRefs are
+// confirmed by the owner)
+//
+// OwnerRRef -> A -> Y -> Z
+//
+// OwnerRRef forks to A, then A forks to Y, and Y forks to Z. Z can be deleted
+// without OwnerRRef knowing Y. However, the OwnerRRef will at least know A, as
+// the owner directly forks the RRef to A. A won't die before the owner knows Y.
+//
+// Things get a little trickier if the RRef is created on a user:
+//
+// OwnerRRef
+// ^
+// |
+// A -> Y -> Z
+//
+// If Z calls to_here on the UserRRef, the owner at least knows A when Z is
+// deleted, because otherwise to_here wouldn't finish. If Z does not call
+// to_here, it is possible that the owner receives all messages from Z before
+// any message from A and Y. In this case, as the real data of the OwnerRRef
+// has not been created yet, there is nothing to be deleted either. It is the
+// same as if Z did not exist at all. Hence, it's still OK.
+//
+// See #26759 for more details and discussions.
+//
+// TODO: make RRef an IValue, and edit createStackForSchema accordingly
+// TODO: make RRef system messages idempotent and retry on failures.
+//
+// ``RRef`` is the base type for both ``UserRRef`` and ``OwnerRRef``.
+// Each ``RRef`` has a globally unique ``RRefId``.
class RRef {
public:
// RRef is made NOT copyable NOT movable to prevent messing up reference
- // counting
+ // counting.
RRef(const RRef& other) = delete;
RRef(RRef&& other) = delete;
+ RRef& operator=(RRef&& other) = delete;
virtual ~RRef() = default;
- worker_id_t owner() const;
- const RRefId& id() const;
- IValue fork() const;
+ // Returns the worker id of the owner
+ inline worker_id_t owner() const {
+ return ownerId_;
+ }
+
+ // Returns the globally unique RRefId of this RRef
+ inline const RRefId& rrefId() const {
+ return rrefId_;
+ }
+
+ // Returns true if this is the ``OwnerRRef``
virtual bool isOwner() const = 0;
- virtual IValue toHere() = 0;
+
+ // Returns true if this RRef holds a py::object, false if IValue
+ virtual bool isPyObj() = 0;
protected:
+ friend class RRefContext;
+
RRef(worker_id_t ownerId, const RRefId& rrefId);
+ RRefForkData fork() const;
+
const worker_id_t ownerId_;
const RRefId rrefId_;
};
+// ``UserRRef`` represents a user of an RRef. Besides the ``RRefId``, each user
+// also has a globally unique ``ForkId`` to identify this user. ``UserRRef``
+// never owns the real value; the only way to get the value of the ``RRef`` is
+// to call ``to_here()`` and get a copy.
+template
class UserRRef final : public RRef {
public:
+ UserRRef(const UserRRef& other) = delete;
+ UserRRef(UserRRef&& other) = delete;
+ UserRRef& operator=(const UserRRef& other) = delete;
+ UserRRef& operator=(UserRRef&& other) = delete;
+
+ inline bool isOwner() const override {
+ return false;
+ }
+
+ inline bool isPyObj() override {
+ return std::is_same::value;
+ }
+
+ // Returns the globally unique ForkId of this RRef
const ForkId& forkId() const;
- bool isOwner() const override;
- IValue toHere() override;
+ // Get a copy of the value from the ``OwnerRRef``. If the value is not ready
+ // yet, this call will block.
+ T toHere();
+
+ // Upon destruction, this ``UserRRef`` will tell the owner to deref.
~UserRRef() override; private: @@ -93,28 +268,27 @@ class UserRRef final : public RRef { template class OwnerRRef final : public RRef { public: - bool isOwner() const override { + OwnerRRef(const OwnerRRef& other) = delete; + OwnerRRef(OwnerRRef&& other) = delete; + OwnerRRef& operator=(const OwnerRRef& other) = delete; + OwnerRRef& operator=(OwnerRRef&& other) = delete; + + inline bool isOwner() const override { return true; } - T getValue() const { - // TODO: use callback to make this non-blocking - std::unique_lock lock(mutex_); - valueCV_.wait(lock, [this] { return value_.has_value(); }); - return value_.value(); + inline bool isPyObj() override { + return std::is_same::value; } - void setValue(T&& value) { - { - std::lock_guard lock(mutex_); - value_ = std::move(value); - } - valueCV_.notify_all(); - } + // Get a constant reference of the real value. This method will block if the + // value is not ready. This method does not need GIL as it does not create + // any new py::object. + const T& getValue() const; - IValue toHere() override { - AT_ERROR("OwnerRRef does not support toHere(), use getValue() instead."); - } + // Set the value of this ``OwnerRRef``. This method does not need GIL as it + // does not create any new py::object. + void setValue(T&& value); private: friend class RRefContext; @@ -122,9 +296,6 @@ class OwnerRRef final : public RRef { OwnerRRef(worker_id_t ownerId, const RRefId& rrefId) : OwnerRRef(ownerId, rrefId, {}) {} - OwnerRRef(OwnerRRef&& other) noexcept - : OwnerRRef(other.owner(), other.id(), std::move(other.value_)) {} - OwnerRRef(worker_id_t ownerId, const RRefId& rrefId, c10::optional value) : RRef(ownerId, rrefId) { value_ = std::move(value); diff --git a/torch/csrc/distributed/rpc/rref_context.cpp b/torch/csrc/distributed/rpc/rref_context.cpp index dc7302e8adce9..5f04af9cbf30c 100644 --- a/torch/csrc/distributed/rpc/rref_context.cpp +++ b/torch/csrc/distributed/rpc/rref_context.cpp @@ -1,10 +1,13 @@ #include +#include + +#include namespace torch { namespace distributed { namespace rpc { -std::unique_ptr RRefContext::context_; +std::unique_ptr RRefContext::context_ = nullptr; void RRefContext::initInstance(std::shared_ptr agent) { TORCH_CHECK(!RRefContext::context_, "Can only initialize RRefContext once."); @@ -20,50 +23,291 @@ std::unique_ptr& RRefContext::getInstance() { return RRefContext::context_; } +void RRefContext::destroyInstance() { + RRefContext::getInstance()->checkRRefLeaks(); + RRefContext::context_.reset(); +} + +void RRefContext::handleException(const Message& message) { + if (message.type() == MessageType::EXCEPTION) { + // TODO: allow users to register an error handler and call it here. 
+ std::string err(message.payload().begin(), message.payload().end());
+ VLOG(1) << "Got exception: " << err << std::endl << std::flush;
+ throw std::runtime_error(err);
+ }
+}
+
RRefContext::RRefContext(std::shared_ptr agent)
: agent_(std::move(agent)) {}
-worker_id_t RRefContext::getWorkerId() const {
- return agent_->getWorkerId().id_;
+RRefContext::~RRefContext() {
+ if (!owners_.empty()) {
+ AutoGIL ag;
+ owners_.clear();
+ }
+}
+
+void RRefContext::checkRRefLeaks() {
+ if (!forks_.empty()) {
+ std::stringstream ss;
+ for (auto& entry : forks_) {
+ const RRefId& rrefId = entry.first;
+ for (const auto& forkId : entry.second) {
+ ss << "Leaking RRef " << rrefId << " with fork Id " << forkId
+ << std::endl;
+ }
+ }
+ AT_ERROR(ss.str());
+ }
}
-RRefId RRefContext::genRRefId() {
- return RRefId(getWorkerId(), nextLocalId_++);
+template
+std::shared_ptr> RRefContext::createUserRRef(worker_id_t ownerId) {
+ TORCH_CHECK(ownerId != getWorkerId(), "Cannot create UserRRef on owner.");
+ return createUserRRef(
+ ownerId, genGloballyUniqueId(), genGloballyUniqueId());
}
-const std::shared_ptr& RRefContext::agent() const {
- return agent_;
+template std::shared_ptr> RRefContext::createUserRRef(
+ worker_id_t ownerId);
+
+template std::shared_ptr> RRefContext::createUserRRef<
+ py::object>(worker_id_t ownerId);
+
+template
+std::shared_ptr> RRefContext::createUserRRef(
+ worker_id_t ownerId,
+ const RRefId& rrefId,
+ const ForkId& forkId) {
+ TORCH_CHECK(ownerId != getWorkerId(), "RRef owner cannot create user RRef.");
+ // RRefContext does not track user RRefs; a UserRRef will be destructed when
+ // there are no shared_ptrs pointing to it.
+ //
+ // NB: cannot use make_shared here as the constructor of UserRRef is private.
+ // NB: This UserRRef has not been confirmed by the owner yet. This function's
+ // call site is responsible for adding this UserRRef to pendingUsers_.
+ // Currently, there are two call sites.
+ // (1) The creator user in python_functions.cpp
+ // (2) The callee user in RRefContext::notifyOwnerAndParentOfFork.
+ //
+ // The reason for not adding the pending user here is to put addPendingUser()
+ // close to where the RPC occurs, and it is clearer to pair it with
+ // delPendingUser() in the response callback at the call site.
+  return std::shared_ptr<UserRRef<T>>(new UserRRef<T>(ownerId, rrefId, forkId));
 }
 
-void RRefContext::addFork(const at::IValue& value) {
-  auto rfd = RRefForkData::fromIValue(value);
-  AT_ASSERT(
-      rfd.ownerId_ == getWorkerId(),
-      "RRef user should never receive fork notification.");
+template std::shared_ptr<UserRRef<IValue>> RRefContext::createUserRRef<IValue>(
+    worker_id_t ownerId,
+    const RRefId& rrefId,
+    const ForkId& forkId);
+
+template std::shared_ptr<UserRRef<py::object>> RRefContext::createUserRRef<
+    py::object>(
+    worker_id_t ownerId,
+    const RRefId& rrefId,
+    const ForkId& forkId);
+
+template <typename T>
+std::shared_ptr<RRef> RRefContext::getOrCreateRRef(const RRefForkData& rfd) {
+  auto& ownerId = rfd.ownerId_;
+  auto& rrefId = rfd.rrefId_;
+  auto& forkId = rfd.forkId_;
+  if (ownerId == getWorkerId()) {
+    return getOrCreateOwnerRRef<T>(rrefId);
+  } else {
+    return createUserRRef<T>(ownerId, rrefId, forkId);
+  }
+}
+
+template std::shared_ptr<RRef> RRefContext::getOrCreateRRef<IValue>(
+    const RRefForkData& rfd);
+
+template std::shared_ptr<RRef> RRefContext::getOrCreateRRef<py::object>(
+    const RRefForkData& rfd);
+
+template <typename T>
+std::shared_ptr<OwnerRRef<T>> RRefContext::getOrCreateOwnerRRef(
+    const RRefId& rrefId) {
   std::lock_guard<std::mutex> lock(mutex_);
-  auto& rrefForks = forks_[rfd.rrefId_];
-  AT_ASSERT(
-      rrefForks.find(rfd.forkId_) == rrefForks.end(),
-      "Got fork notification twice on the same RRef ",
-      rfd.rrefId_);
-  rrefForks.insert(rfd.forkId_);
+  const auto iter = owners_.find(rrefId);
+  if (iter == owners_.end()) {
+    // Scenario (1) the first time this owner knows about this RRef
+    //
+    // NB: cannot use make_shared here as the constructor of OwnerRRef is
+    // private.
+    auto rref =
+        std::shared_ptr<OwnerRRef<T>>(new OwnerRRef<T>(getWorkerId(), rrefId));
+    owners_[rref->rrefId()] = rref;
+    return rref;
+
+  } else {
+    // Scenario (2) retrieving an existing RRef
+    return std::static_pointer_cast<OwnerRRef<T>>(iter->second);
+  }
+}
+
+template std::shared_ptr<OwnerRRef<IValue>> RRefContext::getOrCreateOwnerRRef<
+    IValue>(const RRefId& rrefId);
+
+template std::shared_ptr<OwnerRRef<py::object>> RRefContext::
+    getOrCreateOwnerRRef<py::object>(const RRefId& rrefId);
+
+RRefForkData RRefContext::prepareChildFork(const std::shared_ptr<RRef>& rref) {
+  auto rfd = rref->fork();
+  if (rref->isOwner()) {
+    // Note [Early Fork Registration]
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // If the parent (caller) is the owner, directly register the fork, instead
+    // of waiting for another RREF_FORK_REQUEST or RREF_CHILD_ACCEPT message. An
+    // alternative is to add the fork when the callee user ACKs. However, before
+    // that, the owner would still have to add the OwnerRRef into some map to
+    // keep it alive (e.g., in pendingChildren_). Hence, adding the fork here or
+    // in the ACK makes no difference except added complexity.
+    // TODO: When adding failure retries and timeout, this fork needs to be
+    // deleted if the owner does not receive the ACK within the timeout.
+    addForkOfOwner(rfd.rrefId_, rfd.forkId_);
+  } else {
+    // Note [Useful Phantom Fork ID for User to Owner Call]
+    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    // If the callee of dist.remote or dist.rpc is the owner of this RRef, the
+    // callee will not create a fork using this rfd.forkId_, because the owner
+    // will only keep one `OwnerRRef` instance and will not create any
+    // `UserRRef` instances. However, this rfd.forkId_ is still necessary, as
+    // the caller user needs to keep this `UserRRef` alive until it gets the
+    // ACK from the callee owner. Otherwise, the delete message could arrive
+    // at the owner before this dist.rpc or dist.remote call, which could
+    // potentially trigger the `OwnerRRef` to be deleted before running the
+    // user code.
+    addPendingChild(rfd.forkId_, rref);
+  }
+  return rfd;
 }
 
-void RRefContext::delFork(const at::IValue& value) {
-  auto rfd = RRefForkData::fromIValue(value);
-  AT_ASSERT(
-      rfd.ownerId_ == getWorkerId(),
-      "RRef user should never receive delete notification.");
+void RRefContext::notifyOwnerAndParentOfFork(
+    const ForkId& forkId,
+    worker_id_t parent,
+    const std::shared_ptr<RRef>& rref) {
+  if (parent == rref->owner()) {
+    // If the parent is the owner, this fork has already been added into the
+    // forks_ map when the owner sends the message to the callee user. Hence,
+    // it is not necessary to send another RREF_CHILD_ACCEPT or
+    // RREF_FORK_REQUEST back to the owner. See Note [Early Fork Registration].
+    return;
+  }
+
+  if (rref->isOwner()) {
+    // See Note [Useful Phantom Fork ID for User to Owner Call]
+    // In this case, the owner is the caller, and it does not add the fork id
+    // into forks_, because there will be no real `UserRRef` associated with
+    // this fork ID.
+    auto fm = agent_->send(
+        agent_->getWorkerInfo(parent), RRefChildAccept(forkId).toMessage());
+    fm->addCallback([](const Message& message) { handleException(message); });
+  } else {
+    auto fm = agent_->send(
+        agent_->getWorkerInfo(rref->owner()),
+        RRefForkRequest(rref->rrefId(), forkId).toMessage());
+
+    addPendingUser(forkId, rref);
+    fm->addCallback([this, forkId, parent](const Message& message) {
+      handleException(message);
+      this->finishForkRequest(forkId, parent);
+    });
+  }
+}
+
+void RRefContext::addPendingChild(
+    const ForkId& forkId,
+    const std::shared_ptr<RRef>& rref) {
+  // see Note [Early Fork Registration]
+  // If the parent is the owner, it should directly add the child UserRRef as
+  // a fork.
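+  // Consequently, only UserRRefs ever appear in pendingChildren_; an
+  // OwnerRRef stays alive through the owners_ map until its last fork is
+  // deleted (see delForkOfOwner below).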
+  TORCH_INTERNAL_ASSERT(
+      !rref->isOwner(), "OwnerRRef should not have a pending child.");
+  std::lock_guard<std::mutex> lock(mutex_);
+  TORCH_INTERNAL_ASSERT(
+      pendingChildren_.find(forkId) == pendingChildren_.end(),
+      "Inconsistent state: attempt to add the same child fork twice.");
+  pendingChildren_[forkId] = rref;
+}
+
+void RRefContext::delPendingChild(const ForkId& forkId) {
   std::lock_guard<std::mutex> lock(mutex_);
-  auto& rrefForks = forks_[rfd.rrefId_];
-  AT_ASSERT(
-      rrefForks.find(rfd.forkId_) != rrefForks.end(),
-      "Attempt to delete a non-exist fork ",
-      rfd.forkId_);
-  rrefForks.erase(rfd.forkId_);
-  if (rrefForks.empty()) {
-    owners_.erase(rfd.rrefId_);
-    forks_.erase(rfd.rrefId_);
+  auto iter = pendingChildren_.find(forkId);
+  TORCH_INTERNAL_ASSERT(
+      iter != pendingChildren_.end(),
+      "Inconsistent state: attempt to delete a non-existent child fork.");
+  pendingChildren_.erase(iter);
+}
+
+void RRefContext::addPendingUser(
+    const ForkId& forkId,
+    const std::shared_ptr<RRef>& rref) {
+  TORCH_INTERNAL_ASSERT(
+      !rref->isOwner(), "Attempt to add an OwnerRRef as a pending User.");
+  std::lock_guard<std::mutex> lock(mutex_);
+  TORCH_INTERNAL_ASSERT(
+      pendingUsers_.find(forkId) == pendingUsers_.end(),
+      "Inconsistent state: attempt to add the same UserRRef twice.");
+  pendingUsers_[forkId] = rref;
+}
+
+void RRefContext::delPendingUser(const ForkId& forkId) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto iter = pendingUsers_.find(forkId);
+  TORCH_INTERNAL_ASSERT(
+      iter != pendingUsers_.end(),
+      "Inconsistent state: attempt to delete a non-existent UserRRef.");
+  pendingUsers_.erase(iter);
+}
+
+void RRefContext::finishForkRequest(const ForkId& forkId, worker_id_t parent) {
+  delPendingUser(forkId);
+  auto fm = agent_->send(
+      agent_->getWorkerInfo(parent), RRefChildAccept(forkId).toMessage());
+
+  fm->addCallback([](const Message& message) { handleException(message); });
+}
+
+void RRefContext::addForkOfOwner(const RRefId& rrefId, const ForkId& forkId) {
+  std::lock_guard<std::mutex> lock(mutex_);
+  auto& rrefForks = forks_[rrefId];
+  TORCH_INTERNAL_ASSERT(
+      rrefForks.find(forkId) == rrefForks.end(),
+      "Got fork notification twice on the same RRef ",
+      forkId);
+  rrefForks.insert(forkId);
+}
+
+void RRefContext::delForkOfOwner(const RRefId& rrefId, const ForkId& forkId) {
+  std::shared_ptr<RRef> deletedRRef = nullptr;
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    auto rrefIter = forks_.find(rrefId);
+    TORCH_INTERNAL_ASSERT(
+        rrefIter != forks_.end(),
+        "Inconsistent state: deleting a fork before the owner knows about it.");
+    auto& rrefForks = rrefIter->second;
+    auto forkIter = rrefForks.find(forkId);
+    TORCH_INTERNAL_ASSERT(
+        forkIter != rrefForks.end(),
+        "Attempt to delete a non-existent fork ",
+        forkId);
+
+    rrefForks.erase(forkId);
+
+    if (rrefForks.empty()) {
+      auto ownerIter = owners_.find(rrefId);
+      if (ownerIter != owners_.end()) {
+        deletedRRef = ownerIter->second;
+        owners_.erase(ownerIter);
+      }
+      forks_.erase(rrefIter);
+    }
+  }
+  if (deletedRRef && deletedRRef->isPyObj()) {
+    AutoGIL ag;
+    deletedRRef.reset();
+  }
 }
diff --git a/torch/csrc/distributed/rpc/rref_context.h b/torch/csrc/distributed/rpc/rref_context.h
index e18967416eb24..50247e8def4b7 100644
--- a/torch/csrc/distributed/rpc/rref_context.h
+++ b/torch/csrc/distributed/rpc/rref_context.h
@@ -17,90 +17,103 @@ class RRefContext {
  public:
   static void initInstance(std::shared_ptr<RpcAgent>);
   static std::unique_ptr<RRefContext>& getInstance();
+  static void destroyInstance();
+
+  static void handleException(const Message& message);
 
   RRefContext(const RRefContext&) = delete;
+  RRefContext(RRefContext&& other) = delete;
   void operator=(const RRefContext&) = delete;
+  RRefContext& operator=(RRefContext&& other) = delete;
 
-  worker_id_t getWorkerId() const;
-  RRefId genRRefId();
-  const std::shared_ptr<RpcAgent>& agent() const;
+  ~RRefContext();
 
-  // create a new RRef
-  template <typename T>
-  std::shared_ptr<OwnerRRef<T>> createOwnerRRef(worker_id_t ownerId) {
-    TORCH_CHECK(ownerId == getWorkerId(), "Cannot create OwnerRRef on user.");
-    return getOrCreateOwnerRRef<T>(genRRefId());
+  // get the worker id of the current worker
+  inline worker_id_t getWorkerId() const {
+    return agent_->getWorkerInfo().id_;
   }
 
-  std::shared_ptr<UserRRef> createUserRRef(worker_id_t ownerId) {
-    TORCH_CHECK(ownerId != getWorkerId(), "Cannot create UserRRef on owner.");
-    return createUserRRef(ownerId, genRRefId(), genRRefId());
+  // get the worker name of the current worker
+  inline const std::string& getWorkerName() const {
+    return agent_->getWorkerInfo().name_;
   }
 
-  std::shared_ptr<UserRRef> createUserRRef(
-      worker_id_t ownerId,
-      const RRefId& rrefId,
-      const ForkId& forkId) {
-    TORCH_CHECK(
-        ownerId != getWorkerId(), "RRef owner cannot create user RRef.");
-    // RRefContext does not track user RRefs, it will be destructed when there
-    // is no shared_ptrs pointing to it. NB: cannot use make_shared here as the
-    // constructor of UserRRef is private
-    return std::shared_ptr<UserRRef>(new UserRRef(ownerId, rrefId, forkId));
+  // generate a globally unique ID
+  inline GloballyUniqueId genGloballyUniqueId() {
+    return GloballyUniqueId(getWorkerId(), nextLocalId_++);
   }
 
-  // get an existing RRef or create a new one from a serialized
-  // ``RRefForkData``.
-  template <typename T>
-  std::shared_ptr<RRef> getOrCreateRRef(at::IValue&& value) {
-    auto rfd = RRefForkData::fromIValue(std::move(value));
-    return getOrCreateRRef<T>(rfd.ownerId_, rfd.rrefId_, rfd.forkId_);
+  inline const std::shared_ptr<RpcAgent>& agent() const {
+    return agent_;
   }
 
+  // create a ``UserRRef`` owned by the worker ``ownerId``
   template <typename T>
-  std::shared_ptr<RRef> getOrCreateRRef(
-      worker_id_t ownerId,
-      const RRefId& rrefId,
-      const ForkId& forkId) {
-    if (ownerId == getWorkerId()) {
-      return getOrCreateOwnerRRef<T>(rrefId);
-    } else {
-      return createUserRRef(ownerId, rrefId, forkId);
-    }
-  }
+  std::shared_ptr<UserRRef<T>> createUserRRef(worker_id_t ownerId);
 
+  // Convert an RRefForkData into an RRef. This RRef could be user or owner.
+  // This RRef could have already existed before, or could be created in this
+  // method.
   template <typename T>
-  std::shared_ptr<OwnerRRef<T>> getOrCreateOwnerRRef(const RRefId& rrefId) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    const auto iter = owners_.find(rrefId);
-    if (iter == owners_.end()) {
-      // Scenario (1) the first time this owner knows about this RRef
-      // Scenario (2) This owner is also the creator.
-      //
-      // NB: cannot use make_shared here as the constructor of OwnerRRef is
-      // private.
-      auto rref = std::shared_ptr<OwnerRRef<T>>(
-          new OwnerRRef<T>(getWorkerId(), rrefId));
-      owners_[rref->id()] = rref;
-      return rref;
-
-    } else {
-      // Scenario (3) retrieving an existing RRef
-      return std::dynamic_pointer_cast<OwnerRRef<T>>(iter->second);
-    }
-  }
+  std::shared_ptr<RRef> getOrCreateRRef(const RRefForkData& rfd);
 
-  void addFork(const at::IValue& value);
-  void delFork(const at::IValue& value);
+  // Get the ``OwnerRRef`` of id ``rrefId``. If it does not exist, create a new
+  // one.
+  template <typename T>
+  std::shared_ptr<OwnerRRef<T>> getOrCreateOwnerRRef(const RRefId& rrefId);
+
+  // Register a fork of the ``OwnerRRef``, and insert a shared_ptr of the
+  // ``OwnerRRef`` into a map to keep it alive.
+  void addForkOfOwner(const RRefId& rrefId, const ForkId& forkId);
+  // Delete a fork of the ``OwnerRRef``. NB: this could trigger deletion of the
+  // IValue or py::object. For the latter, this method will acquire the GIL.
+  void delForkOfOwner(const RRefId& rrefId, const ForkId& forkId);
+
+  // Invoked when pickling an RRef to set up the child/fork properly
+  RRefForkData prepareChildFork(const std::shared_ptr<RRef>& rref);
+  // Invoked when unpickling an RRef to send RREF_FORK_REQUEST to the owner and
+  // RREF_CHILD_ACCEPT to the parent.
+  // NB: forkId is necessary here as the rref could be an OwnerRRef
+  void notifyOwnerAndParentOfFork(
+      const ForkId& forkId,
+      worker_id_t parent,
+      const std::shared_ptr<RRef>& rref);
+
+  // When a UserRRef is forked to another worker (user or owner), it is added
+  // into pendingChildren_ to be held alive until it receives RREF_CHILD_ACCEPT
+  // from the child.
+  // NB: This is necessary for both user and owner children. As we do not have
+  // FIFO communication between workers, we need this strategy to make sure
+  // that all previously submitted rpc/remote calls are acked before sending
+  // out the RREF_USER_DELETE message. Otherwise, the OwnerRRef could be
+  // deleted too soon.
+  void addPendingChild(const ForkId& forkId, const std::shared_ptr<RRef>& rref);
+  void delPendingChild(const ForkId& forkId);
+
+  // When a UserRRef is created, it is added into pendingUsers_ to be held
+  // alive until it receives RREF_USER_ACCEPT from the owner.
+  void addPendingUser(const ForkId& forkId, const std::shared_ptr<RRef>& rref);
+  void delPendingUser(const ForkId& forkId);
 
  private:
   RRefContext(std::shared_ptr<RpcAgent>);
 
+  template <typename T>
+  std::shared_ptr<UserRRef<T>> createUserRRef(
+      worker_id_t ownerId,
+      const RRefId& rrefId,
+      const ForkId& forkId);
+
+  void finishForkRequest(const ForkId& forkId, worker_id_t parent);
+
+  // If there is any leak on any RRef, this method will throw an error.
+  void checkRRefLeaks();
+
   static std::unique_ptr<RRefContext> context_;
   static std::atomic<local_id_t> nextLocalId_;
 
   const std::shared_ptr<RpcAgent> agent_;
-  std::mutex mutex_;
+  mutable std::mutex mutex_;
   // Keep OwnerRRefs alive until there is no living UserRRefs.
   std::unordered_map<RRefId, std::shared_ptr<RRef>, RRefId::Hash> owners_;
   // Tracks known living UserRRefs of an OwnerRRef
@@ -109,6 +122,26 @@ class RRefContext {
       std::unordered_set<ForkId, ForkId::Hash>,
       RRefId::Hash>
       forks_;
+
+  // The following two maps keep UserRRefs alive by holding a shared_ptr to
+  // the RRef instances. A UserRRef must be added into this map if any of the
+  // following two conditions is true:
+  //
+  // (1) A UserRRef has not been accepted by the owner yet.
+  //
+  //     It can be used or shared, but cannot be deleted, and hence kept alive
+  //     in this map. A message of type RREF_USER_ACCEPT will remove the
+  //     corresponding RRef from this map.
+  std::unordered_map<ForkId, std::shared_ptr<RRef>, ForkId::Hash> pendingUsers_;
+
+  // (2) A UserRRef has forked a child UserRRef which has not been accepted by
+  //     the owner yet.
+  //
+  //     In this case, this UserRRef cannot send out the RREF_USER_DELETE
+  //     message, as it could potentially trigger the OwnerRRef to be deleted
+  //     before the owner learns about the forked child.
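+  //
+  //     A message of type RREF_CHILD_ACCEPT from the child removes the
+  //     corresponding RRef from this map (see delPendingChild).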
+  std::unordered_map<ForkId, std::shared_ptr<RRef>, ForkId::Hash>
+      pendingChildren_;
 };
 
 } // namespace rpc
diff --git a/torch/csrc/distributed/rpc/rref_proto.cpp b/torch/csrc/distributed/rpc/rref_proto.cpp
new file mode 100644
index 0000000000000..b0d39a061b302
--- /dev/null
+++ b/torch/csrc/distributed/rpc/rref_proto.cpp
@@ -0,0 +1,173 @@
+#include
+#include
+
+#include
+
+namespace torch {
+namespace distributed {
+namespace rpc {
+
+namespace {
+
+std::vector<at::IValue> toIValues(const Message& message, MessageType type) {
+  TORCH_INTERNAL_ASSERT(
+      type == message.type(),
+      "Expecting message of type ",
+      type,
+      ", but got ",
+      message.type());
+  auto payload = static_cast<const char*>(message.payload().data());
+  auto payload_size = message.payload().size();
+
+  auto value =
+      jit::unpickle(payload, payload_size, nullptr, &message.tensors());
+  return value.toTuple()->elements();
+}
+
+Message fromIValues(std::vector<at::IValue> ivalues, MessageType type) {
+  std::vector<torch::Tensor> tensor_table;
+  auto payload = jit::pickle(
+      c10::ivalue::Tuple::create(std::move(ivalues)), &tensor_table);
+  return Message(std::move(payload), std::move(tensor_table), type);
+}
+
+} // namespace
+
+/////////////////////////// RRefMessageBase //////////////////////////////////
+
+const RRefId& RRefMessageBase::rrefId() {
+  return rrefId_;
+}
+
+Message RRefMessageBase::toMessage() && {
+  return fromIValues({rrefId_.toIValue()}, type_);
+}
+
+at::IValue RRefMessageBase::fromMessage(
+    const Message& message,
+    MessageType type) {
+  auto values = toIValues(message, type);
+
+  TORCH_INTERNAL_ASSERT(
+      values.size() == 1, "RRefMessageBase expects 1 IValue from message.");
+  return std::move(values.back());
+}
+
+/////////////////////////// ForkMessageBase //////////////////////////////////
+
+const ForkId& ForkMessageBase::forkId() {
+  return forkId_;
+}
+
+Message ForkMessageBase::toMessage() && {
+  return fromIValues({rrefId_.toIValue(), forkId_.toIValue()}, type_);
+}
+
+std::pair<RRefId, ForkId> ForkMessageBase::fromMessage(
+    const Message& message,
+    MessageType type) {
+  auto ivalues = toIValues(message, type);
+
+  TORCH_INTERNAL_ASSERT(
+      ivalues.size() == 2, "ForkMessageBase expects 2 IValues from message.");
+
+  return std::make_pair(
+      RRefId::fromIValue(ivalues[0]), ForkId::fromIValue(ivalues[1]));
+}
+
+/////////////////////////// RRef Protocol //////////////////////////////////
+
+std::unique_ptr<ScriptRRefFetchCall> ScriptRRefFetchCall::fromMessage(
+    const Message& message) {
+  return c10::guts::make_unique<ScriptRRefFetchCall>(
+      RRefId::fromIValue(RRefMessageBase::fromMessage(
+          message, MessageType::SCRIPT_RREF_FETCH_CALL)));
+}
+
+std::unique_ptr<PythonRRefFetchCall> PythonRRefFetchCall::fromMessage(
+    const Message& message) {
+  return c10::guts::make_unique<PythonRRefFetchCall>(
+      RRefId::fromIValue(RRefMessageBase::fromMessage(
+          message, MessageType::PYTHON_RREF_FETCH_CALL)));
+}
+
+const std::vector<at::IValue>& RRefFetchRet::values() {
+  return values_;
+}
+
+Message RRefFetchRet::toMessage() && {
+  std::vector<at::IValue> ivalues = values_;
+  std::vector<torch::Tensor> tensor_table;
+  auto payload =
+      jit::pickle(c10::ivalue::Tuple::create(ivalues), &tensor_table);
+
+  return Message(
+      std::move(payload), std::move(tensor_table), MessageType::RREF_FETCH_RET);
+}
+
+std::unique_ptr<RRefFetchRet> RRefFetchRet::fromMessage(
+    const Message& message) {
+  auto payload = static_cast<const char*>(message.payload().data());
+  auto payload_size = message.payload().size();
+
+  auto value =
+      jit::unpickle(payload, payload_size, nullptr, &message.tensors());
+  auto values = value.toTuple()->elements();
+
+  return c10::guts::make_unique<RRefFetchRet>(std::move(values));
+}
+
+std::unique_ptr<RRefUserDelete> RRefUserDelete::fromMessage(
+    const Message& message) {
+  auto pair =
+      ForkMessageBase::fromMessage(message, MessageType::RREF_USER_DELETE);
+  return c10::guts::make_unique<RRefUserDelete>(
+      RRefUserDelete(pair.first, pair.second));
+}
+
+std::unique_ptr<RemoteRet> RemoteRet::fromMessage(const Message& message) {
+  auto pair = ForkMessageBase::fromMessage(message, MessageType::REMOTE_RET);
+  return c10::guts::make_unique<RemoteRet>(pair.first, pair.second);
+}
+
+const ForkId& RRefChildAccept::forkId() const {
+  return forkId_;
+}
+
+Message RRefChildAccept::toMessage() && {
+  return fromIValues({forkId_.toIValue()}, MessageType::RREF_CHILD_ACCEPT);
+}
+
+std::unique_ptr<RRefChildAccept> RRefChildAccept::fromMessage(
+    const Message& message) {
+  auto values = toIValues(message, MessageType::RREF_CHILD_ACCEPT);
+  TORCH_INTERNAL_ASSERT(values.size() == 1, "Expect 1 IValue from message.");
+
+  return c10::guts::make_unique<RRefChildAccept>(
+      ForkId::fromIValue(values.back()));
+}
+
+std::unique_ptr<RRefForkRequest> RRefForkRequest::fromMessage(
+    const Message& message) {
+  auto pair =
+      ForkMessageBase::fromMessage(message, MessageType::RREF_FORK_REQUEST);
+  return c10::guts::make_unique<RRefForkRequest>(pair.first, pair.second);
+}
+
+Message RRefAck::toMessage() && {
+  return Message({}, {}, MessageType::RREF_ACK);
+}
+
+std::unique_ptr<RRefAck> RRefAck::fromMessage(const Message& message) {
+  TORCH_INTERNAL_ASSERT(
+      message.type() == MessageType::RREF_ACK,
+      "Message type mismatch, expected ",
+      MessageType::RREF_ACK,
+      ", but got ",
+      message.type());
+  return c10::guts::make_unique<RRefAck>();
+}
+
+} // namespace rpc
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/rpc/rref_proto.h b/torch/csrc/distributed/rpc/rref_proto.h
new file mode 100644
index 0000000000000..0a3c9ccda3191
--- /dev/null
+++ b/torch/csrc/distributed/rpc/rref_proto.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace torch {
+namespace distributed {
+namespace rpc {
+
+// Temporary solution for RRef operations.
+// TODO: Remove all these messages and use rpc + registered functions instead.
+class TORCH_API RRefMessageBase : public RpcCommandBase {
+ public:
+  RRefMessageBase(const RRefId& rrefId, MessageType type)
+      : rrefId_(rrefId), type_(type) {}
+
+  virtual ~RRefMessageBase() override = default;
+
+  const RRefId& rrefId();
+
+  virtual Message toMessage() && override;
+  static at::IValue fromMessage(const Message& message, MessageType type);
+
+ protected:
+  const RRefId rrefId_;
+  const MessageType type_;
+};
+
+class TORCH_API ForkMessageBase : public RRefMessageBase {
+ public:
+  ForkMessageBase(const RRefId& rrefId, const ForkId& forkId, MessageType type)
+      : RRefMessageBase(rrefId, type), forkId_(forkId) {}
+
+  virtual ~ForkMessageBase() override = default;
+
+  const ForkId& forkId();
+
+  virtual Message toMessage() && override;
+  static std::pair<RRefId, ForkId> fromMessage(
+      const Message& message,
+      MessageType type);
+
+ protected:
+  const ForkId forkId_;
+};
+
+// UserRRef uses this message to fetch the remote RRef value from the owner.
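+// The owner responds with an RRefFetchRet message carrying the value (see
+// RRefFetchRet below).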
+class TORCH_API ScriptRRefFetchCall final : public RRefMessageBase {
+ public:
+  explicit ScriptRRefFetchCall(const RRefId& rrefId)
+      : RRefMessageBase(rrefId, MessageType::SCRIPT_RREF_FETCH_CALL) {}
+
+  static std::unique_ptr<ScriptRRefFetchCall> fromMessage(
+      const Message& message);
+};
+
+class TORCH_API PythonRRefFetchCall final : public RRefMessageBase {
+ public:
+  explicit PythonRRefFetchCall(const RRefId& rrefId)
+      : RRefMessageBase(rrefId, MessageType::PYTHON_RREF_FETCH_CALL) {}
+
+  static std::unique_ptr<PythonRRefFetchCall> fromMessage(
+      const Message& message);
+};
+
+// OwnerRRef uses this message to send the RRef value to a remote UserRRef.
+class TORCH_API RRefFetchRet final : public RpcCommandBase {
+ public:
+  explicit RRefFetchRet(std::vector<at::IValue> values)
+      : values_(std::move(values)) {}
+
+  const std::vector<at::IValue>& values();
+
+  Message toMessage() && override;
+  static std::unique_ptr<RRefFetchRet> fromMessage(const Message& message);
+
+ private:
+  std::vector<at::IValue> values_;
+};
+
+// A UserRRef (regardless of whether it is the creator or not) uses this
+// message to notify the OwnerRRef on delete.
+class TORCH_API RRefUserDelete final : public ForkMessageBase {
+ public:
+  RRefUserDelete(const RRefId& rrefId, const ForkId& forkId)
+      : ForkMessageBase(rrefId, forkId, MessageType::RREF_USER_DELETE) {}
+
+  static std::unique_ptr<RRefUserDelete> fromMessage(const Message& message);
+};
+
+class TORCH_API RemoteRet final : public ForkMessageBase {
+ public:
+  RemoteRet(const RRefId& rrefId, const ForkId& forkId)
+      : ForkMessageBase(rrefId, forkId, MessageType::REMOTE_RET) {}
+
+  static std::unique_ptr<RemoteRet> fromMessage(const Message& message);
+};
+
+// A child RRef uses this message to notify its parent that the child has been
+// confirmed by the owner.
+class TORCH_API RRefChildAccept final : public RpcCommandBase {
+ public:
+  explicit RRefChildAccept(const ForkId& forkId) : forkId_(forkId) {}
+
+  const ForkId& forkId() const;
+
+  Message toMessage() && override;
+  static std::unique_ptr<RRefChildAccept> fromMessage(const Message& message);
+
+ private:
+  const ForkId forkId_;
+};
+
+// A child RRef uses this message to send a fork request to the owner.
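+// Upon the owner's response, the requesting user removes its pendingUsers_
+// entry and sends RREF_CHILD_ACCEPT to its parent (see
+// RRefContext::finishForkRequest).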
+class TORCH_API RRefForkRequest final : public ForkMessageBase {
+ public:
+  RRefForkRequest(const RRefId& rrefId, const ForkId& forkId)
+      : ForkMessageBase(rrefId, forkId, MessageType::RREF_FORK_REQUEST) {}
+
+  static std::unique_ptr<RRefForkRequest> fromMessage(const Message& message);
+};
+
+class TORCH_API RRefAck final : public RpcCommandBase {
+ public:
+  RRefAck() {}
+
+  Message toMessage() && override;
+  static std::unique_ptr<RRefAck> fromMessage(const Message& message);
+};
+
+} // namespace rpc
+} // namespace distributed
+} // namespace torch
diff --git a/torch/csrc/distributed/rpc/script_remote_call.cpp b/torch/csrc/distributed/rpc/script_remote_call.cpp
index a8d8a7a19dc42..2af16642256c9 100644
--- a/torch/csrc/distributed/rpc/script_remote_call.cpp
+++ b/torch/csrc/distributed/rpc/script_remote_call.cpp
@@ -9,32 +9,26 @@ namespace rpc {
 ScriptRemoteCall::ScriptRemoteCall(
     std::shared_ptr<Operator> op,
     std::vector<at::IValue>&& args,
-    at::IValue retRRefId,
-    at::IValue retForkId)
+    const RRefId& retRRefId,
+    const ForkId& retForkId)
     : ScriptCall(std::move(op), std::move(args)),
-      retRRefId_(std::move(retRRefId)),
-      retForkId_(std::move(retForkId)) {}
-
-const at::IValue& ScriptRemoteCall::retRRefId() {
-  return retRRefId_;
-}
-
-const at::IValue& ScriptRemoteCall::retForkId() {
-  return retForkId_;
-}
+      retRRefId_(retRRefId),
+      retForkId_(retForkId) {}
 
 Message ScriptRemoteCall::toMessage() && {
   std::vector<at::IValue> ivalues;
   ScriptCall::toIValues(ivalues);
-  ivalues.push_back(retRRefId_);
-  ivalues.push_back(retForkId_);
+  ivalues.emplace_back(retRRefId_.toIValue());
+  ivalues.emplace_back(retForkId_.toIValue());
 
   std::vector<torch::Tensor> tensor_table;
-  auto payload =
-      jit::pickle(c10::ivalue::Tuple::create(ivalues), &tensor_table);
+  auto payload = jit::pickle(
+      c10::ivalue::Tuple::create(std::move(ivalues)), &tensor_table);
 
   return Message(
-      std::move(payload), std::move(tensor_table), MessageType::REMOTE_CALL);
+      std::move(payload),
+      std::move(tensor_table),
+      MessageType::SCRIPT_REMOTE_CALL);
 }
 
 std::unique_ptr<ScriptRemoteCall> ScriptRemoteCall::fromMessage(
@@ -47,9 +41,9 @@ std::unique_ptr<ScriptRemoteCall> ScriptRemoteCall::fromMessage(
   auto values = value.toTuple()->elements();
 
   // remove the last element from values and convert it back to an RRef
-  auto retForkId = std::move(values.back());
+  auto retForkId = ForkId::fromIValue(values.back());
   values.pop_back();
-  auto retRRefId = std::move(values.back());
+  auto retRRefId = RRefId::fromIValue(values.back());
   values.pop_back();
 
   auto op = ScriptCall::fromIValues(values);
diff --git a/torch/csrc/distributed/rpc/script_remote_call.h b/torch/csrc/distributed/rpc/script_remote_call.h
index e0afb9497ee44..46bd957bbbaa3 100644
--- a/torch/csrc/distributed/rpc/script_remote_call.h
+++ b/torch/csrc/distributed/rpc/script_remote_call.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -11,26 +12,32 @@ namespace rpc {
 
 using torch::jit::Operator;
 
-// A ScriptCall instance represents an invocation of a builtin operator for a
-// TorchScript function (not implemented yet). If it is a builtin operator, it
-// contains a shared ptr to the `Operator` and a list of arguments.
+// A ScriptRemoteCall instance represents an invocation of `dist.remote` on a
+// builtin operator. Currently, it does not yet support using RRefs as
+// arguments. Besides the operator and a vector of arguments, ScriptRemoteCall
+// also contains the RRefId and the ForkId of the return value RRef.
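+// On the wire, toMessage() pickles a tuple of [op ivalues..., retRRefId,
+// retForkId]; fromMessage() pops the ForkId first and then the RRefId (see
+// script_remote_call.cpp above).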
 class TORCH_API ScriptRemoteCall final : public ScriptCall {
  public:
   ScriptRemoteCall(
       std::shared_ptr<Operator> op,
       std::vector<at::IValue>&& args,
-      at::IValue retRRefId,
-      at::IValue retForkId);
+      const RRefId& retRRefId,
+      const ForkId& retForkId);
 
-  const at::IValue& retRRefId();
-  const at::IValue& retForkId();
+  inline const RRefId& retRRefId() const {
+    return retRRefId_;
+  }
+
+  inline const ForkId& retForkId() const {
+    return retForkId_;
+  }
 
   Message toMessage() && override;
   static std::unique_ptr<ScriptRemoteCall> fromMessage(const Message& message);
 
  private:
-  const at::IValue retRRefId_;
-  const at::IValue retForkId_;
+  const RRefId retRRefId_;
+  const ForkId retForkId_;
 };
 
 } // namespace rpc
diff --git a/torch/csrc/distributed/rpc/types.cpp b/torch/csrc/distributed/rpc/types.cpp
index 2071a6a59ae3c..1d106301bd001 100644
--- a/torch/csrc/distributed/rpc/types.cpp
+++ b/torch/csrc/distributed/rpc/types.cpp
@@ -4,6 +4,17 @@ namespace torch {
 namespace distributed {
 namespace rpc {
 
+static_assert(
+    std::numeric_limits<local_id_t>::max() <=
+        std::numeric_limits<int64_t>::max(),
+    "The max value of local_id_t must be within the range of int64_t");
+static_assert(
+    std::numeric_limits<worker_id_t>::max() <=
+        std::numeric_limits<int64_t>::max(),
+    "The max value of worker_id_t must be within the range of int64_t");
+
+/////////////////////////// GloballyUniqueId ///////////////////////////
+
 GloballyUniqueId::GloballyUniqueId(worker_id_t createdOn, local_id_t localId)
     : createdOn_(createdOn), localId_(localId) {}
 
@@ -16,8 +27,8 @@ bool GloballyUniqueId::operator!=(const GloballyUniqueId& other) const {
 }
 
 at::IValue GloballyUniqueId::toIValue() const {
-  std::vector<at::IValue> ivalues = {(int64_t)createdOn_, (int64_t)localId_};
-  return c10::ivalue::Tuple::create(std::move(ivalues));
+  return c10::ivalue::Tuple::create(
+      {static_cast<int64_t>(createdOn_), static_cast<int64_t>(localId_)});
 }
 
 GloballyUniqueId GloballyUniqueId::fromIValue(const at::IValue& ivalue) {
@@ -28,18 +39,17 @@ GloballyUniqueId GloballyUniqueId::fromIValue(const at::IValue& ivalue) {
       "expects a GenericList of two elements, but got ",
       ivalues.size());
 
-  worker_id_t createdOn = ivalues[0].toInt();
-  local_id_t localId = ivalues[1].toInt();
-
   TORCH_CHECK(
-      createdOn < std::numeric_limits<worker_id_t>::max(),
+      ivalues[0].toInt() <= std::numeric_limits<worker_id_t>::max(),
       "GloballyUniqueId createdOn out of range, got ",
-      createdOn);
+      ivalues[0].toInt());
+  worker_id_t createdOn = ivalues[0].toInt();
 
   TORCH_CHECK(
-      localId < std::numeric_limits<local_id_t>::max(),
+      ivalues[1].toInt() <= std::numeric_limits<local_id_t>::max(),
       "GloballyUniqueId localId out of range, got ",
-      localId);
+      ivalues[1].toInt());
+  local_id_t localId = ivalues[1].toInt();
 
   return GloballyUniqueId(createdOn, localId);
 }
@@ -49,6 +59,29 @@ std::ostream& operator<<(std::ostream& os, GloballyUniqueId const& globalId) {
             << globalId.localId_ << ")";
 }
 
+/////////////////////////// SerializedPyObj ///////////////////////////
+
+std::vector<at::IValue> SerializedPyObj::toIValues() const {
+  std::vector<at::IValue> ivalues;
+  ivalues.reserve(tensors_.size() + 1);
+  for (auto& tensor : tensors_) {
+    ivalues.emplace_back(tensor);
+  }
+  ivalues.emplace_back(payload_);
+  return ivalues;
+}
+
+SerializedPyObj SerializedPyObj::fromIValues(std::vector<at::IValue> values) {
+  std::string payload = values.back().toStringRef();
+  values.pop_back();
+  std::vector<at::Tensor> tensors;
+  tensors.reserve(values.size());
+  for (auto& value : values) {
+    tensors.emplace_back(value.toTensor());
+  }
+  return SerializedPyObj(std::move(payload), std::move(tensors));
+}
+
 } // namespace rpc
 } // namespace distributed
 } // namespace torch
diff --git a/torch/csrc/distributed/rpc/types.h b/torch/csrc/distributed/rpc/types.h
index 47de3d09f205a..78b869df261c3 100644
--- a/torch/csrc/distributed/rpc/types.h
+++ b/torch/csrc/distributed/rpc/types.h
@@ -8,11 +8,12 @@ namespace distributed {
 namespace rpc {
 
 using worker_id_t = int16_t;
-using local_id_t = uint64_t;
+using local_id_t = int64_t;
 
-struct GloballyUniqueId final {
+struct TORCH_API GloballyUniqueId final {
   GloballyUniqueId(worker_id_t createdOn, local_id_t localId);
   GloballyUniqueId(const GloballyUniqueId& other) = default;
+  GloballyUniqueId& operator=(const GloballyUniqueId& other) = delete;
 
   bool operator==(const GloballyUniqueId& other) const;
   bool operator!=(const GloballyUniqueId& other) const;
@@ -32,11 +33,24 @@ struct GloballyUniqueId final {
   const local_id_t localId_;
 };
 
-std::ostream& operator<<(std::ostream& os, const GloballyUniqueId& globalId);
+TORCH_API std::ostream& operator<<(
+    std::ostream& os,
+    const GloballyUniqueId& globalId);
 
 using RRefId = GloballyUniqueId;
 using ForkId = GloballyUniqueId;
 
+struct TORCH_API SerializedPyObj final {
+  SerializedPyObj(std::string&& payload, std::vector<at::Tensor>&& tensors)
+      : payload_(std::move(payload)), tensors_(std::move(tensors)) {}
+
+  std::vector<at::IValue> toIValues() const;
+  static SerializedPyObj fromIValues(std::vector<at::IValue> values);
+
+  const std::string payload_;
+  const std::vector<at::Tensor> tensors_;
+};
+
 } // namespace rpc
 } // namespace distributed
 } // namespace torch
diff --git a/torch/csrc/distributed/rpc/utils.cpp b/torch/csrc/distributed/rpc/utils.cpp
index 0bf80593b2fd9..def48925cc8db 100644
--- a/torch/csrc/distributed/rpc/utils.cpp
+++ b/torch/csrc/distributed/rpc/utils.cpp
@@ -1,11 +1,12 @@
 #include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 #include
-#include
 
 namespace torch {
 namespace distributed {
@@ -19,17 +20,26 @@ std::unique_ptr<RpcCommandBase> deserializeRequest(const Message& request) {
     case MessageType::PYTHON_CALL: {
       return PythonUDFCall::fromMessage(request);
     }
-    case MessageType::REMOTE_CALL: {
+    case MessageType::SCRIPT_REMOTE_CALL: {
       return ScriptRemoteCall::fromMessage(request);
     }
-    case MessageType::RREF_FETCH_CALL: {
+    case MessageType::PYTHON_REMOTE_CALL: {
+      return PythonRemoteCall::fromMessage(request);
+    }
+    case MessageType::SCRIPT_RREF_FETCH_CALL: {
       return ScriptRRefFetchCall::fromMessage(request);
     }
-    case MessageType::RREF_USER_CREATE: {
-      return ScriptRRefCreate::fromMessage(request);
+    case MessageType::PYTHON_RREF_FETCH_CALL: {
+      return PythonRRefFetchCall::fromMessage(request);
     }
     case MessageType::RREF_USER_DELETE: {
-      return ScriptRRefDelete::fromMessage(request);
+      return RRefUserDelete::fromMessage(request);
+    }
+    case MessageType::RREF_CHILD_ACCEPT: {
+      return RRefChildAccept::fromMessage(request);
+    }
+    case MessageType::RREF_FORK_REQUEST: {
+      return RRefForkRequest::fromMessage(request);
     }
     case MessageType::MESSAGE_WITH_AUTOGRAD_REQ: {
       return RpcWithAutograd::fromMessage(request);
@@ -49,6 +59,15 @@ std::unique_ptr<RpcCommandBase> deserializeResponse(const Message& response) {
     case MessageType::PYTHON_RET: {
       return PythonUDFResp::fromMessage(response);
     }
+    case MessageType::REMOTE_RET: {
+      return RemoteRet::fromMessage(response);
+    }
+    case MessageType::RREF_FETCH_RET: {
+      return RRefFetchRet::fromMessage(response);
+    }
+    case MessageType::RREF_ACK: {
+      return RRefAck::fromMessage(response);
+    }
     case MessageType::EXCEPTION: {
       std::string err(response.payload().begin(), response.payload().end());
       throw std::runtime_error(err);
diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py
index 987b529cdee15..10f745b6b13cc 100644
--- a/torch/distributed/__init__.py
+++ b/torch/distributed/__init__.py
@@ -54,4 +54,4 @@ def init_model_parallel(self_name,
     """
     _init_rpc(backend, self_name, self_rank, init_method, num_send_recv_threads)
     from .rpc_api import _agent
-    autograd._init(_agent.get_worker_id().id)
+    autograd._init(_agent.get_worker_info().id)
diff --git a/torch/distributed/internal_rpc_utils.py b/torch/distributed/internal_rpc_utils.py
index 6475366e471b0..2734294b875f9 100644
--- a/torch/distributed/internal_rpc_utils.py
+++ b/torch/distributed/internal_rpc_utils.py
@@ -100,6 +100,8 @@ def deserialize(self, binary_data, tensor_table):
 # Create _internal_rpc_pickler only once to initialize _dispatch_table only once
 _internal_rpc_pickler = _InternalRPCPickler()
 
+def serialize(obj):
+    return _internal_rpc_pickler.serialize(obj)
 
 def run_python_udf_internal(pickled_python_udf, tensors):
     r"""
@@ -114,7 +116,8 @@ def run_python_udf_internal(pickled_python_udf, tensors):
         # except str = exception info + traceback string
         except_str = "{}\n{}".format(repr(e), traceback.format_exc())
         result = RemoteException(except_str)
-    return _internal_rpc_pickler.serialize(result)
+    return result
 
 
 def load_python_udf_result_internal(pickled_python_result, tensors):
diff --git a/torch/distributed/rpc_api.py b/torch/distributed/rpc_api.py
index 8b033229a76fb..6c391490ae074 100644
--- a/torch/distributed/rpc_api.py
+++ b/torch/distributed/rpc_api.py
@@ -1,11 +1,12 @@
 #!/usr/bin/env python3
 
-from . import invoke_rpc_builtin, invoke_rpc_python_udf, invoke_remote_builtin
-from . import init_rref_context
+from . import invoke_rpc_builtin, invoke_rpc_python_udf
+from . import invoke_remote_builtin, invoke_remote_python_udf
+from . import _init_rref_context, _destroy_rref_context
 from . import ProcessGroupAgent
-from . import WorkerId
-from .rpc_backend_registry import is_rpc_backend_registered, init_rpc_backend
+from . import WorkerInfo
 from .internal_rpc_utils import _internal_rpc_pickler, PythonUDF
+from .rpc_backend_registry import is_rpc_backend_registered, init_rpc_backend
 
 import functools
 import sys
@@ -38,6 +39,7 @@ def join_rpc():
     if _agent:
         _agent.join()
         _agent = None
+        _destroy_rref_context()
 
 
 @_require_initialized
@@ -78,40 +80,41 @@ def _init_rpc(backend=RpcBackend.PROCESS_GROUP,
                     self_rank, group.rank()))
         # TODO: add try-except and destroy _agent in all processes if any fails.
         _agent = ProcessGroupAgent(self_name, group, num_send_recv_threads)
-        init_rref_context(_agent)
+        _init_rref_context(_agent)
     elif is_rpc_backend_registered(backend):
         _agent = init_rpc_backend(
            backend,
            self_rank=self_rank,
            self_name=self_name,
-            init_method=init_method,
+            init_method=init_method
        )
-        init_rref_context(_agent)
+        _init_rref_context(_agent)
     else:
         raise RuntimeError("Unrecognized RPC backend ", backend)
 
 
 @_require_initialized
-def get_worker_id(worker_name=None):
+def get_worker_info(worker_name=None):
     r"""
-    Get worker id of a given worker name. Use this worker id to avoid passing
-    an expensive string to ``rpc`` on every invocation.
+    Get the WorkerInfo of a given worker name. Use this WorkerInfo to avoid
+    passing an expensive string to ``rpc`` on every invocation. The WorkerInfo
+    contains the name of the worker and the id of the worker.
 
     Arguments:
         worker_name (str): the string name of a worker. If ``None``, return
                            the id of the current worker. (default ``None``)
    """
    if worker_name:
-        return _agent.get_worker_id(worker_name)
+        return _agent.get_worker_info(worker_name)
    else:
-        return _agent.get_worker_id()
+        return _agent.get_worker_info()
 
 
-def _to_worker_id(name_or_id):
-    if isinstance(name_or_id, WorkerId):
+def _to_worker_info(name_or_id):
+    if isinstance(name_or_id, WorkerInfo):
         return name_or_id
     elif isinstance(name_or_id, str):
-        return get_worker_id(name_or_id)
+        return get_worker_info(name_or_id)
     else:
         raise ValueError("Unsupported RPC worker ID type {}".format(name_or_id))
 
@@ -142,7 +145,7 @@ def remote(to, func, args=None, kwargs=None):
         >>> import torch.distributed as dist
         >>> dist.init_process_group(backend='gloo', rank=0, world_size=2)
         >>> dist.init_rpc("worker0")
-        >>> worker1 = dist.get_worker_id("worker1")
+        >>> worker1 = dist.get_worker_info("worker1")
         >>> rref1 = dist.remote(worker1, torch.add, args=(torch.ones(2), 3))
         >>> rref2 = dist.remote(worker1, torch.add, args=(torch.ones(2), 1))
         >>> x = rref1.to_here() + rref2.to_here()
@@ -159,8 +162,15 @@ def remote(to, func, args=None, kwargs=None):
     args = args if args else ()
     kwargs = kwargs if kwargs else {}
 
-    return invoke_remote_builtin(
-        _agent, _to_worker_id(to), qualified_name, *args, **kwargs)
+    info = _to_worker_info(to)
+    if qualified_name is not None:
+        return invoke_remote_builtin(
+            _agent, info, qualified_name, *args, **kwargs)
+    else:
+        (pickled_python_udf, tensors) = _internal_rpc_pickler.serialize(
+            PythonUDF(func, args, kwargs))
+        return invoke_remote_python_udf(
+            _agent, info, pickled_python_udf, tensors)
 
 
 def _invoke_rpc(to, func, args=None, kwargs=None):
@@ -172,15 +182,16 @@ def _invoke_rpc(to, func, args=None, kwargs=None):
     args = args if args else ()
     kwargs = kwargs if kwargs else {}
 
+    info = _to_worker_info(to)
     if qualified_name is not None:
         fut = invoke_rpc_builtin(
-            _agent, _to_worker_id(to), qualified_name, *args, **kwargs
+            _agent, info, qualified_name, *args, **kwargs
         )
     else:
         (pickled_python_udf, tensors) = _internal_rpc_pickler.serialize(
             PythonUDF(func, args, kwargs))
         fut = invoke_rpc_python_udf(
-            _agent, _to_worker_id(to), pickled_python_udf, tensors)
+            _agent, info, pickled_python_udf, tensors)
     return fut
 
 
@@ -314,7 +325,7 @@ def rpc(to, func, args=None, kwargs=None, async_call=False):
         >>> import torch.distributed as dist
         >>> dist.init_process_group(backend='gloo', rank=0, world_size=2)
         >>> dist.init_model_parallel("worker0")
-        >>> worker1 = dist.get_worker_id("worker1")
+        >>> worker1 = dist.get_worker_info("worker1")
         >>> fut1 = dist.rpc(worker1, torch.add, args=(torch.ones(2), 3), async_call=True)
         >>> fut2 = dist.rpc(worker1, min, args=(1, 2), async_call=True)
         >>> result = fut1.wait() + fut2.wait()
@@ -330,6 +341,7 @@ def rpc(to, func, args=None, kwargs=None, async_call=False):
         """dist.rpc is deprecated. Use dist.rpc_async for asynchronous calls
            or dist.rpc_sync for synchronous calls instead."""
     )
+
     if async_call:
         return rpc_async(to, func, args, kwargs)
     else:

From f5df46ce397335c43cb15457e3828a1a66cbbfa9 Mon Sep 17 00:00:00 2001
From: Jeremy Lilley
Date: Thu, 3 Oct 2019 17:57:57 -0700
Subject: [PATCH 036/113] Set MINIZ_NO_TIME to avoid computing localtime on
 each pickle/unpickle (#27268)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27268

For small pickle/unpickle, we spend a disproportionate amount of time in
time functions - roughly 23% in __tzset() for the unpickle case.

We're not currently using the .m_time, though we can add this feature
back if it's ever needed.
An alternative would be to pass -DMINIZ_NO_TIME in compiler_flags, but we
would then need to consistently #define MINIZ_NO_TIME in any .cpp including
this .h, since this #define modifies the struct length in an unfortunate
manner.

Test Plan: buck test mode/dev-nosan caffe2/test/...

Run benchmark:
  buck-out/opt/gen/caffe2/torch/fb/distributed/thriftRpcBackend/test/ThriftRpcAgentBench

Differential Revision: D17724198

fbshipit-source-id: b44a0217b1d9f8ce6c0f24297f59045c7cadf4b1
---
 third_party/miniz-2.0.8/miniz.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/miniz-2.0.8/miniz.h b/third_party/miniz-2.0.8/miniz.h
index 751c0f3754aa2..67b533079f255 100755
--- a/third_party/miniz-2.0.8/miniz.h
+++ b/third_party/miniz-2.0.8/miniz.h
@@ -125,7 +125,7 @@
 /* If MINIZ_NO_TIME is specified then the ZIP archive functions will not be able to get the current time, or */
 /* get/set file times, and the C run-time funcs that get/set times won't be called. */
 /* The current downside is the times written to your archives will be from 1979. */
-/*#define MINIZ_NO_TIME */
+#define MINIZ_NO_TIME
 
 /* Define MINIZ_NO_ARCHIVE_APIS to disable all ZIP archive API's. */
 /*#define MINIZ_NO_ARCHIVE_APIS */

From 86a8971ebb07dfb2c21fc1f4d80389c8e952e7e5 Mon Sep 17 00:00:00 2001
From: Shihao Xu
Date: Thu, 3 Oct 2019 18:58:10 -0700
Subject: [PATCH 037/113] Add a test case to RpcTest, check src/dst (#27322)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27322

# Problem

Existing test cases are too symmetric, so they didn't detect this error: a
request sent to the wrong worker. Because of a wrong `worker_names` setup,
worker0 sent a request to itself when it should have sent it to worker1.

# Solution

Add a test case that lets the dst side check whether the request came from
the expected src.
ghstack-source-id: 91299312

Reviewed By: satgera

Differential Revision: D17069062

fbshipit-source-id: ef7a532dd497bfc0f0ee8446fcd5d29656aaf175
---
 test/rpc_test.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/test/rpc_test.py b/test/rpc_test.py
index f7cefdcb62270..081e644a2616a 100644
--- a/test/rpc_test.py
+++ b/test/rpc_test.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 from __future__ import absolute_import, division, print_function, unicode_literals
 
+import concurrent.futures
 import sys
 import unittest
 from os import getenv
@@ -29,10 +30,17 @@ def requires_process_group_agent(func):
 RPC_INIT_URL = getenv("RPC_INIT_URL", "")
 
+VALUE_FUTURE = concurrent.futures.Future()
+
+
 def stub_init_rpc_backend_handler(self_rank, self_name, init_method):
     return mock.Mock()  # RpcAgent.
 
+def set_value(value):
+    VALUE_FUTURE.set_result(value)
+
+
 # it is used to test python user defined function over rpc
 # classes and functions are used to test python user defined class and
 # methods over rpc
@@ -423,6 +431,14 @@ def test_join_rpc(self):
         # it's safe to call join_rpc() multiple times
         dist.join_rpc()
 
+    @dist_init
+    def test_expected_src(self):
+        dst_rank = (self.rank + 1) % self.world_size
+        expected_src_rank = (self.rank - 1) % self.world_size
+        ret = dist.rpc_sync("worker{}".format(dst_rank), set_value, args=(self.rank,))
+        value = VALUE_FUTURE.result()
+        self.assertEqual(value, expected_src_rank)
+
     @dist_init
     def test_py_built_in(self):
         n = self.rank + 1

From c3d97c2638b2cf790efba8ce939e600e9b772faa Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Thu, 3 Oct 2019 20:02:04 -0700
Subject: [PATCH 038/113] Update to ROCm 2.8 (#27337)

Summary:
New docker images built with tag 324.
Related jenkins changes: https://github.com/pytorch/ossci-job-dsl/commit/83ec81335742e66b02af90b7c74021b8792fc63f https://github.com/pytorch/ossci-job-dsl/commit/aa235a14c82db69d0544cd8fc1da03ef9a50096e Triggered CI runs: https://ci.pytorch.org/jenkins/job/caffe2-builds/job/py2-devtoolset7-rocmrpm-centos7.5-trigger-test/48682/ https://ci.pytorch.org/jenkins/job/pytorch-builds/job/py2-clang7-rocmdeb-ubuntu16.04-trigger/55638/ Pull Request resolved: https://github.com/pytorch/pytorch/pull/27337 Differential Revision: D17753827 Pulled By: bddppq fbshipit-source-id: 2c3f77b0b7c680013c7cc6d7953fe0da4922fe48 --- .../cimodel/data/caffe2_build_definitions.py | 2 +- .circleci/config.yml | 40 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.circleci/cimodel/data/caffe2_build_definitions.py b/.circleci/cimodel/data/caffe2_build_definitions.py index 6a712002e28dc..07c3a24c48921 100644 --- a/.circleci/cimodel/data/caffe2_build_definitions.py +++ b/.circleci/cimodel/data/caffe2_build_definitions.py @@ -14,7 +14,7 @@ DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/" -DOCKER_IMAGE_VERSION = 315 +DOCKER_IMAGE_VERSION = 324 @dataclass diff --git a/.circleci/config.yml b/.circleci/config.yml index ddb1cd5401701..4304000f9b515 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1934,7 +1934,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-gcc4.8-ubuntu14.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:324" - caffe2_linux_test: name: caffe2_py2_gcc4_8_ubuntu14_04_test requires: @@ -1946,7 +1946,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-gcc4.8-ubuntu14.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:324" resource_class: large - caffe2_linux_build: name: caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build @@ -1958,7 +1958,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" - caffe2_linux_test: name: caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test requires: @@ -1971,14 +1971,14 @@ workflows: - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-test" use_cuda_docker_runtime: "1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" resource_class: gpu.medium - caffe2_linux_build: name: caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build requires: - setup build_environment: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" - caffe2_linux_test: name: caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test requires: @@ -1986,14 +1986,14 @@ workflows: - caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build build_environment: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-test" use_cuda_docker_runtime: 
"1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" resource_class: gpu.medium - caffe2_linux_build: name: caffe2_py3_5_cuda10_1_cudnn7_ubuntu16_04_build requires: - setup build_environment: "caffe2-py3.5-cuda10.1-cudnn7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:324" - caffe2_linux_test: name: caffe2_py3_5_cuda10_1_cudnn7_ubuntu16_04_test requires: @@ -2001,35 +2001,35 @@ workflows: - caffe2_py3_5_cuda10_1_cudnn7_ubuntu16_04_build build_environment: "caffe2-py3.5-cuda10.1-cudnn7-ubuntu16.04-test" use_cuda_docker_runtime: "1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:324" resource_class: gpu.medium - caffe2_linux_build: name: caffe2_py2_mkl_ubuntu16_04_build requires: - setup build_environment: "caffe2-py2-mkl-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:324" - caffe2_linux_test: name: caffe2_py2_mkl_ubuntu16_04_test requires: - setup - caffe2_py2_mkl_ubuntu16_04_build build_environment: "caffe2-py2-mkl-ubuntu16.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:324" resource_class: large - caffe2_linux_build: name: caffe2_onnx_py2_gcc5_ubuntu16_04_build requires: - setup build_environment: "caffe2-onnx-py2-gcc5-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:324" - caffe2_linux_test: name: caffe2_onnx_py2_gcc5_ubuntu16_04_test requires: - setup - caffe2_onnx_py2_gcc5_ubuntu16_04_build build_environment: "caffe2-onnx-py2-gcc5-ubuntu16.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:324" resource_class: large - caffe2_linux_build: name: caffe2_py2_clang3_8_ubuntu16_04_build @@ -2041,7 +2041,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-clang3.8-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:324" build_only: "1" - caffe2_linux_build: name: caffe2_py2_clang3_9_ubuntu16_04_build @@ -2053,35 +2053,35 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-clang3.9-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:324" build_only: "1" - caffe2_linux_build: name: caffe2_py2_clang7_ubuntu16_04_build requires: - setup build_environment: "caffe2-py2-clang7-ubuntu16.04-build" - docker_image: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:324" build_only: "1" - caffe2_linux_build: name: caffe2_onnx_py3_6_clang7_ubuntu16_04_build requires: - setup build_environment: "caffe2-onnx-py3.6-clang7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:324" - caffe2_linux_test: name: caffe2_onnx_py3_6_clang7_ubuntu16_04_test requires: - setup - caffe2_onnx_py3_6_clang7_ubuntu16_04_build build_environment: "caffe2-onnx-py3.6-clang7-ubuntu16.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:324" resource_class: large - caffe2_linux_build: name: caffe2_py2_android_ubuntu16_04_build requires: - setup build_environment: "caffe2-py2-android-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:324" build_only: "1" - caffe2_linux_build: name: caffe2_py2_cuda9_0_cudnn7_centos7_build @@ -2093,7 +2093,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-centos7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:324" - caffe2_linux_test: name: caffe2_py2_cuda9_0_cudnn7_centos7_test requires: @@ -2106,7 +2106,7 @@ workflows: - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-centos7-test" use_cuda_docker_runtime: "1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:315" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:324" resource_class: gpu.medium - caffe2_macos_build: name: caffe2_py2_ios_macos10_13_build From 736c754739a6ea66e8f0344051d64bc10ff628ad Mon Sep 17 00:00:00 2001 From: Tao Xu Date: Thu, 3 Oct 2019 20:06:17 -0700 Subject: [PATCH 039/113] add sdk support for xcodebuild script Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27358 Test Plan: Imported from OSS Differential Revision: D17757389 Pulled By: xta0 fbshipit-source-id: ed8e470b9c6329b96297ee7c65ba08759251baad --- .../{xcode_ios_x86_build.rb => xcode_build.rb} | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) rename scripts/{xcode_ios_x86_build.rb => xcode_build.rb} (80%) diff --git a/scripts/xcode_ios_x86_build.rb b/scripts/xcode_build.rb similarity index 80% rename from scripts/xcode_ios_x86_build.rb rename to scripts/xcode_build.rb index c786d9a17f743..20cb2d9df42af 100644 --- a/scripts/xcode_ios_x86_build.rb +++ b/scripts/xcode_build.rb @@ -10,6 +10,9 @@ opts.on('-x', '--xcodeproj ', 'path to the XCode project file') { |value| options[:xcodeproj] = value } + opts.on('-p', '--platform ', 'iOS platform for the current build') { |value| + options[:platform] = value + } end.parse! 
puts options.inspect @@ -36,6 +39,7 @@ config.build_settings['HEADER_SEARCH_PATHS'] = header_search_path config.build_settings['LIBRARY_SEARCH_PATHS'] = libraries_search_path config.build_settings['OTHER_LINKER_FLAGS'] = other_linker_flags + config.build_settings['ENABLE_BITCODE'] = 'No' end # link static libraries @@ -48,5 +52,15 @@ end project.save +sdk = nil +if options[:platform] == 'SIMULATOR' + sdk = 'iphonesimulator' +elsif options[:platform] == 'OS' + sdk = 'iphoneos' +else + puts "unsupported platform #{options[:platform]}" + exit(false) +end + # run xcodebuild -exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk iphonesimulator -configuration Release" +exec "xcodebuild clean build -project #{xcodeproj_path} -target #{target.name} -sdk #{sdk} -configuration Release" From c874dd91a7063ea658978c4bc2aecabb6cd9bf4e Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Thu, 3 Oct 2019 20:12:35 -0700 Subject: [PATCH 040/113] export remainder (#24410) Summary: Added ONNX export support for torch.remainder and torch.fmod Pull Request resolved: https://github.com/pytorch/pytorch/pull/24410 Reviewed By: hl475 Differential Revision: D17466791 Pulled By: houseroad fbshipit-source-id: afe6519e5f370824e3b4a45b69036a7260fb72cf --- .../expect/TestOperators.test_fmod.expect | 77 ++++++++++++++++ .../TestOperators.test_remainder.expect | 89 +++++++++++++++++++ test/onnx/test_operators.py | 10 +++ test/onnx/test_pytorch_onnx_caffe2.py | 23 +++++ test/onnx/test_pytorch_onnx_onnxruntime.py | 36 ++++++++ torch/onnx/symbolic_helper.py | 7 +- torch/onnx/symbolic_opset10.py | 4 + torch/onnx/symbolic_opset9.py | 8 ++ 8 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 test/onnx/expect/TestOperators.test_fmod.expect create mode 100644 test/onnx/expect/TestOperators.test_remainder.expect diff --git a/test/onnx/expect/TestOperators.test_fmod.expect b/test/onnx/expect/TestOperators.test_fmod.expect new file mode 100644 index 0000000000000..0fd8adb615cb9 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_fmod.expect @@ -0,0 +1,77 @@ +ir_version: 4 +producer_name: "pytorch" +producer_version: "1.3" +graph { + node { + input: "0" + input: "1" + output: "2" + op_type: "Mod" + attribute { + name: "fmod" + i: 1 + type: INT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + input { + name: "1" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "2" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 10 +} diff --git a/test/onnx/expect/TestOperators.test_remainder.expect b/test/onnx/expect/TestOperators.test_remainder.expect new file mode 100644 index 0000000000000..fe6826b151f2c --- /dev/null +++ b/test/onnx/expect/TestOperators.test_remainder.expect @@ -0,0 +1,89 @@ +ir_version: 4 +producer_name: "pytorch" +producer_version: "1.3" +graph { + node { + input: "0" + input: "1" + output: "2" + op_type: "Div" + } + node { + input: "2" + output: "3" + op_type: "Floor" + } + node { + input: "3" + input: "1" + output: "4" + op_type: "Mul" + } + node { + input: "0" + input: "4" + output: "5" + op_type: "Sub" + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + 
elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + input { + name: "1" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 1 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "5" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 4efb6d9b11a85..365ba55a4af7e 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -771,6 +771,16 @@ def test_frobenius_norm(self): x = torch.randn(2, 3, 4).float() self.assertONNX(lambda x: torch.norm(x, p="fro", dim=(0, 1), keepdim=True), x) + def test_remainder(self): + x = torch.randn(2, 3, 4) + y = torch.randn(2, 1, 4) + self.assertONNX(lambda x, y: torch.remainder(x, y), (x, y)) + + def test_fmod(self): + x = torch.randn(2, 3, 4) + y = torch.randn(2, 1, 4) + self.assertONNX(lambda x, y: torch.fmod(x, y), (x, y), opset_version=10) + def test_gelu(self): x = torch.randn(2, 3, 4, 5, requires_grad=True) self.assertONNX(lambda x: torch.nn.functional.gelu(x), x) diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index db616f2659458..32ff6d97c56f6 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -2218,6 +2218,29 @@ def forward(self, x): x = torch.arange(16).view(2, 2, 4).to(torch.float32) self.run_model_test(MaskedFillModel2(), input=(x, ), train=False, batch_size=BATCH_SIZE) + def test_remainder(self): + class RemainderModel(torch.nn.Module): + def forward(self, input, other): + return torch.remainder(input, other) + + x = torch.randn(4, 2, 3) + y = torch.randn(1, 2, 1) + model = RemainderModel() + outputs = model(x, y) + self.run_model_test(model, train=False, input=(x, y), batch_size=BATCH_SIZE, + example_outputs=(outputs,)) + + def test_remainder_scalar(self): + class RemainderModel(torch.nn.Module): + def forward(self, input): + return torch.remainder(input, 2.55) + + inputs = torch.randint(10, (2, 3)) + model = RemainderModel() + outputs = model(inputs) + self.run_model_test(model, train=False, input=(inputs,), batch_size=BATCH_SIZE, + example_outputs=(outputs,)) + def test_baddbmm(self): class MyModule(torch.nn.Module): def forward(self, input, batch1, batch2): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index fc6335def9e9c..4648d1b06b543 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1107,6 +1107,42 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) self.run_test(NormModel(), x) + def test_remainder(self): + class RemainderModel(torch.nn.Module): + def forward(self, input, other): + return torch.remainder(input, other) + + x = torch.randn(4, 2, 3) + y = torch.randn(1, 2, 1) + self.run_test(RemainderModel(), (x, y)) + + def test_remainder_scalar(self): + class RemainderModel(torch.nn.Module): + def forward(self, input): + return torch.remainder(input, 2.55) + + x = torch.randint(10, (2, 3)) + self.run_test(RemainderModel(), x) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_fmod(self): + class FModModel(torch.nn.Module): + def forward(self, input, other): + return torch.fmod(input, other) + + x = torch.randn(4, 2, 3) + y = torch.randn(1, 2, 1) + self.run_test(FModModel(), 
(x, y)) + + @skipIfUnsupportedMinOpsetVersion(10) + def test_fmod_scalar(self): + class FModModel(torch.nn.Module): + def forward(self, input): + return torch.fmod(input, 2.55) + + x = torch.randint(10, (2, 3)) + self.run_test(FModModel(), x) + @skipIfUnsupportedMinOpsetVersion(9) def test_gelu(self): class GeluModel(torch.nn.Module): diff --git a/torch/onnx/symbolic_helper.py b/torch/onnx/symbolic_helper.py index d1966bc5ff7c1..048e69c36c19d 100644 --- a/torch/onnx/symbolic_helper.py +++ b/torch/onnx/symbolic_helper.py @@ -194,6 +194,11 @@ def _slice_helper(g, input, axes, starts, ends, steps=None, dynamic_slice=False) return _slice(g, input, axes, starts, ends, steps, dynamic_slice) +def _is_fp(value): + type = value.type().scalarType() + return (type == 'Float') or (type == 'Double') or (type == 'Half') + + def _sort_helper(g, input, dim, decending=True, out=None): if out is not None: _unimplemented("Sort", "Out parameter is not supported") @@ -237,6 +242,7 @@ def _interpolate_warning(interpolate_mode): "to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).\n" "We recommend using opset 11 and above for models using this operator. ") + def _interpolate_size_to_scales(g, input, output_size, dim): output_size = _maybe_get_const(output_size, 'is') if _is_value(output_size): @@ -254,7 +260,6 @@ def _interpolate_size_to_scales(g, input, output_size, dim): scales = g.op("Constant", value_t=torch.tensor(scales_constant)) return scales - def _scatter_helper(g, self, dim, index, src): if _export_onnx_opset_version <= 10: from torch.onnx.symbolic_opset9 import scatter diff --git a/torch/onnx/symbolic_opset10.py b/torch/onnx/symbolic_opset10.py index 241c9dbc4c587..d0ce9e21a4892 100644 --- a/torch/onnx/symbolic_opset10.py +++ b/torch/onnx/symbolic_opset10.py @@ -168,3 +168,7 @@ def flip(g, input, dims): starts=[-1] * len(dims), ends=[-9223372036854775807] * len(dims), steps=[-1] * len(dims)) + + +def fmod(g, input, other): + return g.op("Mod", input, other, fmod_i=1) diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 4177f1a5567ac..b28f611c63730 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -1957,6 +1957,14 @@ def baddbmm(g, self, batch1, batch2, beta, alpha): mul_b = mul(g, self, g.op("Cast", beta, to_i=sym_help.cast_pytorch_to_onnx[dtype])) return add(g, mul_a, mul_b) +def remainder(g, input, other): + div = g.op("Div", input, other) + if sym_help._is_fp(input): + div = g.op("Floor", div) + quo = g.op("Mul", div, other) + return g.op("Sub", input, quo) + + def gelu(g, self): _sqrt2 = 1.4142135623730951 erf = g.op('Erf', div(g, self, torch.tensor(_sqrt2))) From 6bb7433ad513f22ee2a2eb4f0e7cef59c7aef2a2 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Thu, 3 Oct 2019 20:38:48 -0700 Subject: [PATCH 041/113] Replacing the skip_list with white_list in the qconfig propagation Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27183 Test Plan: Imported from OSS Differential Revision: D17700548 Pulled By: zafartahirov fbshipit-source-id: 18e6ffbda496b14ac1da1783f928ad539cdb1d16 --- test/common_quantization.py | 25 +++++++++++++++++ test/test_quantization.py | 14 +++++++++- torch/quantization/default_mappings.py | 22 +++++++++------ torch/quantization/quantize.py | 37 ++++++++++---------------- 4 files changed, 66 insertions(+), 32 deletions(-) diff --git a/test/common_quantization.py b/test/common_quantization.py index 485d454320a7d..d98b1af8758d8 100644 --- 
a/test/common_quantization.py +++ b/test/common_quantization.py @@ -596,3 +596,28 @@ def forward(self, x): out = out.view(-1, 3 * 2 * 2) out = self.fc(out) return out + +"""Model to make sure that the observers are not inserted into custom modules. +""" +class ModelWithNoQconfigPropagation(nn.Module): + class ListOutModule(nn.Module): + def __init__(self): + super(ModelWithNoQconfigPropagation.ListOutModule, self).__init__() + + def forward(self, x): + # returns a list of tensors, not supported by observers + return [x] + + def __init__(self): + super(ModelWithNoQconfigPropagation, self).__init__() + self.fc1 = nn.Linear(5, 5).to(dtype=torch.float) + self.quant = QuantStub() + self.dequant = DeQuantStub() + self.no_quant_module = self.ListOutModule() + + def forward(self, x): + x = self.quant(x) + x = self.fc1(x) + x = self.dequant(x) + x = self.no_quant_module(x) + return x diff --git a/test/test_quantization.py b/test/test_quantization.py index c141ad605eec1..6b737bcd9d7fc 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -22,7 +22,8 @@ ModelWithFunctionals, \ test_only_eval_fn, test_only_train_fn, \ prepare_dynamic, convert_dynamic, SingleLayerLinearDynamicModel, \ - TwoLayerLinearModel, NestedModel, ResNetBase, LSTMDynamicModel + TwoLayerLinearModel, NestedModel, ResNetBase, LSTMDynamicModel, \ + ModelWithNoQconfigPropagation from common_quantization import AnnotatedTwoLayerLinearModel, AnnotatedNestedModel, \ AnnotatedSubNestedModel, AnnotatedCustomConfigNestedModel @@ -1070,6 +1071,17 @@ def test_observer_scriptable(self): loaded = torch.jit.load(buf) self.assertEqual(obs.calculate_qparams(), loaded.calculate_qparams()) + def test_no_qconfig_propagation(self): + model = ModelWithNoQconfigPropagation() + model.qconfig = torch.quantization.default_qconfig + + model = prepare(model) + self.assertTrue(hasattr(model.fc1, 'qconfig'), + "QConfig is expected to propagate") + self.assertFalse(hasattr(model.no_quant_module, 'qconfig'), + "QConfig is expected to NOT propagate") + + @unittest.skipUnless('fbgemm' in torch.backends.quantized.supported_engines, " Quantized operations require FBGEMM. 
FBGEMM is only optimized for CPUs" " with instruction set support avx2 or newer.") diff --git a/torch/quantization/default_mappings.py b/torch/quantization/default_mappings.py index b5fb1b017f2d0..5b6149937da1b 100644 --- a/torch/quantization/default_mappings.py +++ b/torch/quantization/default_mappings.py @@ -48,12 +48,18 @@ nn.LSTM: nnqd.LSTM, } -# List of modules to skip the qconfig propagation -DEFAULT_SKIP_LIST = ( - nn.Dropout, - nn.Identity, - nn.MaxPool2d, - nn.AvgPool2d, - nn.AdaptiveAvgPool2d, - DeQuantStub +# Whitelist for propagating the qconfig +_EXCLUDE_QCONFIG_PROPAGATE_LIST = { + DeQuantStub, +} +_INCLUDE_QCONFIG_PROPAGATE_LIST = { + nn.Sequential, +} + +DEFAULT_QCONFIG_PROPAGATE_WHITE_LIST = ( + set(DEFAULT_MODULE_MAPPING.keys()) | + set(DEFAULT_QAT_MODULE_MAPPING.keys()) | + set(DEFAULT_DYNAMIC_MODULE_MAPPING.keys()) | + _INCLUDE_QCONFIG_PROPAGATE_LIST - + _EXCLUDE_QCONFIG_PROPAGATE_LIST ) diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index fed6b57a2d06a..75457ca3830fb 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -12,11 +12,11 @@ from .default_mappings import (DEFAULT_DYNAMIC_MODULE_MAPPING, DEFAULT_MODULE_MAPPING, DEFAULT_QAT_MODULE_MAPPING, - DEFAULT_SKIP_LIST) + DEFAULT_QCONFIG_PROPAGATE_WHITE_LIST) from .stubs import DeQuantStub, QuantWrapper from .QConfig import default_dynamic_qconfig, float16_dynamic_qconfig -def _propagate_qconfig_helper(module, qconfig_dict, skip_list=None, +def _propagate_qconfig_helper(module, qconfig_dict, white_list=None, qconfig_parent=None, prefix=''): r"""This is a helper function for `propagate_qconfig_` @@ -24,6 +24,7 @@ def _propagate_qconfig_helper(module, qconfig_dict, skip_list=None, module: input module qconfig_dict: dictionary that maps from name of submodule to quantization configuration + white_list: list of quantizable modules qconfig_parent: quantization config of parent module, we will fallback to this config when there is no specified config for current module @@ -33,32 +34,22 @@ def _propagate_qconfig_helper(module, qconfig_dict, skip_list=None, Return: None, module is modified inplace with qconfig attached """ - if skip_list is None: - skip_list = DEFAULT_SKIP_LIST - if type(module) in skip_list: - module.qconfig = None - if not hasattr(module, 'qconfig'): - module.qconfig = qconfig_parent - if qconfig_dict: - if prefix in qconfig_dict: - module.qconfig = qconfig_dict[prefix] - elif type(module) in qconfig_dict: - module.qconfig = qconfig_dict[type(module)] - - # Don't quantize empty Sequential, empty Sequential is same as - # Identity, but we can't put Sequential into skip list because - # we also have non-empty Sequential and the qconfig needs to - # be propagated to child in that case # TODO: Add test - if len(module._modules) == 0 and type(module) == nn.Sequential: - module.qconfig = None + if white_list is None: + white_list = DEFAULT_QCONFIG_PROPAGATE_WHITE_LIST + module_qconfig = qconfig_dict.get(type(module), qconfig_parent) + module_qconfig = qconfig_dict.get(prefix, module_qconfig) + module_qconfig = getattr(module, 'qconfig', module_qconfig) + + if type(module) in white_list: + module.qconfig = module_qconfig for name, child in module.named_children(): module_prefix = prefix + '.' 
+ name if prefix else name - _propagate_qconfig_helper(child, qconfig_dict, skip_list, - module.qconfig, module_prefix) + _propagate_qconfig_helper(child, qconfig_dict, white_list, + module_qconfig, module_prefix) -# TODO(jerryzh): expose skip_list +# TODO(jerryzh): expose white_list def propagate_qconfig_(module, qconfig_dict=None): r"""Propagate qconfig through the module hierarchy and assign `qconfig` attribute on each leaf module From ef97841147d50349444717af868cd0bdacf41996 Mon Sep 17 00:00:00 2001 From: Zafar Takhirov Date: Thu, 3 Oct 2019 20:46:33 -0700 Subject: [PATCH 042/113] Show a warning that not all dir members of quantized work. (#27339) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27339 This PR just shows a warning message. Eventually we will show a correct __dir__. Test Plan: Imported from OSS Differential Revision: D17751333 Pulled By: zafartahirov fbshipit-source-id: e9bc62fd8dd0147979291d0aac3f1afe5b8c7a9f --- torch/tensor.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torch/tensor.py b/torch/tensor.py index 6d10ce59264f4..ac99e58ec7c3c 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -414,6 +414,8 @@ def __hash__(self): return id(self) def __dir__(self): + if self.is_quantized: + warnings.warn('Only a small subset of methods are supported for quantized tensors.') tensor_methods = dir(self.__class__) tensor_methods.remove('volatile') # deprecated attrs = list(self.__dict__.keys()) From 9ade1e694438ddeae1101d0d10c1ff4a3b87a54b Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Thu, 3 Oct 2019 21:16:47 -0700 Subject: [PATCH 043/113] improve error messages when a method or attribute is missing (#27110) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27110 Previously, missing methods on some types like tensors would produce errors that talked about 'builtins', which are only a thing inside the compiler. Furthermore, the error would only occur when the builtin was applied and it was discovered that no builtin existed. This changes the error message so that a method that does not exist on one of our builtin types is discovered at attribute lookup. 
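For illustration, a minimal sketch of the new behavior (a hypothetical snippet, not from this PR; 'splork' stands in for any method Tensor does not have):

    import torch

    def bad(x):
        # Tensor has no method 'splork'; scripting now fails at attribute
        # lookup instead of waiting until the call is applied
        return x.splork(3)

    try:
        torch.jit.script(bad)
    except RuntimeError as e:
        # the message now names the type, e.g.
        # "Tried to access nonexistent attribute or method 'splork' of type 'Tensor'."
        print(e)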
Test Plan: Imported from OSS Differential Revision: D17677616 Pulled By: zdevito fbshipit-source-id: 2f7cf6c6093a9c832569c44f4b1044a2e56fe205 --- test/test_jit.py | 10 +-- torch/csrc/jit/ir.cpp | 8 +++ torch/csrc/jit/named_value.h | 2 + torch/csrc/jit/script/sugared_value.cpp | 91 ++++++++++++++++--------- torch/csrc/jit/script/sugared_value.h | 7 ++ 5 files changed, 81 insertions(+), 37 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 4dc470ae391fa..9b9133741ea56 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -4598,7 +4598,7 @@ def test_method_on_number(self): def func(): c = 1 return c.add(1) - with self.assertRaisesRegex(RuntimeError, 'Cannot call methods on numbers'): + with self.assertRaisesRegex(RuntimeError, 'nonexistent attribute or method'): torch.jit.script(func) # testing implicit conversion of tensors to scalars to match function arguments @@ -10668,7 +10668,7 @@ def forward(self, x): ReassignSelfRHS() def test_unknown_builtin(self): - with self.assertRaisesRegex(RuntimeError, 'Unknown builtin op'): + with self.assertRaisesRegex(RuntimeError, 'nonexistent attribute or method'): @torch.jit.script def unknown_builtin(x): return x.splork(3) @@ -11966,12 +11966,12 @@ def f(x): self.checkScript(f, (torch.rand(20, 20, 20),), optimize=True) - with self.assertRaisesRegex(RuntimeError, "Unknown attribute to named tuple"): + with self.assertRaisesRegex(RuntimeError, "nonexistent attribute"): @torch.jit.script def g1(x): return x.max(dim=1).unknown_symbol - with self.assertRaisesRegex(RuntimeError, "Getting attributes of tuples is not supported"): + with self.assertRaisesRegex(RuntimeError, "nonexistent attribute"): @torch.jit.script def g2(x): print((x, x, x).__doc__) @@ -19910,7 +19910,7 @@ def call(): # noqa: E306 for func in ops: self.checkScript(func, ()) - with self.assertRaisesRegex(RuntimeError, "nonexistent attribute __add__. 
Did you forget to initialize it"): + with self.assertRaisesRegex(RuntimeError, "nonexistent attribute"): @torch.jit.script def test(): return Foo(torch.tensor(1)) + Foo(torch.tensor(1)) diff --git a/torch/csrc/jit/ir.cpp b/torch/csrc/jit/ir.cpp index 8493d11e70d74..09c1f3ba55137 100644 --- a/torch/csrc/jit/ir.cpp +++ b/torch/csrc/jit/ir.cpp @@ -1711,6 +1711,14 @@ Node* ProfileOp::allocNewInstance(Graph* g) { return new ProfileOp(g, {nullptr}); } +TypePtr NamedValue::type() const { + if (value_) { + return value_->type(); + } else { + return ivalue_.type(); + } +} + constexpr Symbol ProfileOp::Kind; } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/named_value.h b/torch/csrc/jit/named_value.h index e9a7a2a4b30c4..c27abdea37ebd 100644 --- a/torch/csrc/jit/named_value.h +++ b/torch/csrc/jit/named_value.h @@ -70,6 +70,8 @@ struct NamedValue { return *loc_; } + at::TypePtr type() const; + private: c10::optional loc_; c10::optional name_; diff --git a/torch/csrc/jit/script/sugared_value.cpp b/torch/csrc/jit/script/sugared_value.cpp index fb7c16a18efcf..05b186c315281 100644 --- a/torch/csrc/jit/script/sugared_value.cpp +++ b/torch/csrc/jit/script/sugared_value.cpp @@ -87,51 +87,53 @@ std::shared_ptr SimpleValue::attr( return std::make_shared(r); } } - if (value_->type()->isSubtypeOf(NumberType::get())) { - throw ErrorReport(loc) << "Cannot call methods on numbers"; - } + + // accessing fields of named tuples if (auto tuple_type = value_->type()->cast()) { - if (!tuple_type->schema()) { - throw ErrorReport(loc) << "Getting attributes of tuples is not supported"; - } - auto attrs = tuple_type->schema()->arguments(); - for (size_t i = 0; i < attrs.size(); i++) { - if (attrs[i].name() == field) { - auto idx = m.graph()->insertConstant(IValue(static_cast(i))); - auto out_type = tuple_type->elements().at(i); - auto r = - m.graph() - ->insertNode(m.graph()->createTupleIndex(value_, idx, out_type)) - ->output(); - return std::make_shared(r); + if (tuple_type->schema()) { + auto attrs = tuple_type->schema()->arguments(); + for (size_t i = 0; i < attrs.size(); i++) { + if (attrs[i].name() == field) { + auto idx = m.graph()->insertConstant(IValue(static_cast(i))); + auto out_type = tuple_type->elements().at(i); + auto r = m.graph() + ->insertNode( + m.graph()->createTupleIndex(value_, idx, out_type)) + ->output(); + return std::make_shared(r); + } } } - throw ErrorReport(loc) << "Unknown attribute to named tuple"; - } - - if (auto classType = value_->type()->cast()) { + } else if (auto classType = value_->type()->cast()) { // This is a class, emit the proper attribute lookup if (auto method = classType->getMethod(field)) { return std::make_shared(getValue(), field); } - if (!classType->hasAttribute(field)) { - throw ErrorReport(loc) - << "Tried to access nonexistent attribute " << field - << ". 
Did you forget to initialize it in __init__()?"; + if (classType->hasAttribute(field)) { + auto& g = *m.graph(); + auto n = g.insertNode(g.createGetAttr(value_, field)); + return std::make_shared(n->output()); } - auto& g = *m.graph(); - auto n = g.insertNode(g.createGetAttr(value_, field)); - return std::make_shared(n->output()); - } - - if (auto iface = value_->type()->cast()) { + } else if (auto iface = value_->type()->cast()) { + // accessing methods of interfaces if (auto schema = iface->getMethod(field)) { return std::make_shared(getValue(), field); } } - return std::make_shared( - Symbol::aten(field), NamedValue(loc, "self", value_)); + // none of the more-specific cases worked, so see if this is a builtin method + if (auto builtin = BuiltinFunction::tryCreate( + Symbol::aten(field), NamedValue(loc, "self", value_))) { + return builtin; + } + + ErrorReport report(loc); + report << "Tried to access nonexistent attribute or method '" << field + << "' of type '" << value_->type()->python_str() << "'."; + if (value_->type()->kind() == ClassType::Kind) { + report << " Did you forget to initialize an attribute in __init__()?"; + } + throw report; } std::vector> SimpleValue::asTuple( @@ -501,6 +503,31 @@ std::shared_ptr NamedTupleConstructor::call( return std::make_shared(self); } +std::shared_ptr BuiltinFunction::tryCreate( + Symbol symbol, + c10::optional self) { + for (const std::shared_ptr& op : getAllOperatorsFor(symbol)) { + if (!self) { + return std::make_shared(symbol, nullptr); + } + if (auto index = op->schema().argumentIndexWithName("self")) { + std::unordered_map type_env; + TypePtr formal_type = op->schema().arguments().at(*index).type(); + const MatchTypeReturn matched = + matchTypeVariables(formal_type, self->type(), type_env); + if (!matched.success()) { + continue; + } + const auto concrete_type = tryEvalTypeVariables(formal_type, type_env); + if (!concrete_type || !self->type()->isSubtypeOf(concrete_type)) { + continue; + } + return std::make_shared(symbol, self); + } + } + return nullptr; +} + } // namespace script } // namespace jit } // namespace torch diff --git a/torch/csrc/jit/script/sugared_value.h b/torch/csrc/jit/script/sugared_value.h index b838b0de1f91c..5fd61f0a01431 100644 --- a/torch/csrc/jit/script/sugared_value.h +++ b/torch/csrc/jit/script/sugared_value.h @@ -168,6 +168,13 @@ struct TORCH_API BuiltinFunction : public SugaredValue { at::ArrayRef attributes, at::ArrayRef inputs, size_t n_binders) override; + + // try to create this builtin but if it doesn't exist or the self argument + // cannot possibly match, then return nullptr. Use in situations where it is + // not clear if it is a valid builtin + static std::shared_ptr tryCreate( + Symbol symbol, + c10::optional self); }; struct TORCH_API BuiltinModule : public SugaredValue { From 2ea1d3d01ff3dc0f5d13216673154a42c62e9edc Mon Sep 17 00:00:00 2001 From: Zachary DeVito Date: Thu, 3 Oct 2019 21:17:28 -0700 Subject: [PATCH 044/113] refactor extra sugared values (#26270) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26270 We've accumulated a lot of sugared values whose only purpose is to be instanced-checked against in emitApplyExpr. I need to add another one to insert an unchecked_cast, and do not want to continue the pattern. This creates an abstraction for this concept (SpecialFormValue), and removes all the unneeded sugared values. 
There is no functionality change here just a bunch of code movement in compiler.cpp Test Plan: Imported from OSS Differential Revision: D17412854 Pulled By: zdevito fbshipit-source-id: 15877c91decaea5a00d1fe737ed2d0f0f8a79a28 --- aten/src/ATen/core/interned_strings.h | 1 + torch/csrc/jit/import_source.cpp | 6 +- torch/csrc/jit/script/compiler.cpp | 347 +++++++++--------- .../csrc/jit/script/python_sugared_value.cpp | 4 +- torch/csrc/jit/script/sugared_value.cpp | 2 +- torch/csrc/jit/script/sugared_value.h | 89 +---- 6 files changed, 206 insertions(+), 243 deletions(-) diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index e5837680ff2f4..f065e6221e341 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -117,6 +117,7 @@ namespace c10 { _(prim, CallFunction) \ _(prim, CallMethod) \ _(prim, LoopContinuation) \ + _(prim, annotate) \ _(aten, append) \ _(aten, item) \ _(aten, format) \ diff --git a/torch/csrc/jit/import_source.cpp b/torch/csrc/jit/import_source.cpp index 26ed8fe3508e2..682b14f73d5f5 100644 --- a/torch/csrc/jit/import_source.cpp +++ b/torch/csrc/jit/import_source.cpp @@ -109,9 +109,9 @@ struct SourceImporterImpl : public Resolver, // Constants present in the model. Used to resolve "CONSTANTS.n" to the // actual value {"CONSTANTS", std::make_shared(tensor_table)}, - {"fork", std::make_shared()}, - {"annotate", std::make_shared()}, - {"uninitialized", std::make_shared()}, + {"fork", SpecialFormValue::create(prim::fork)}, + {"annotate", SpecialFormValue::create(prim::annotate)}, + {"uninitialized", SpecialFormValue::create(prim::Uninitialized)}, {"inf", std::make_shared( std::numeric_limits::infinity())}, diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index c64408eed21be..c95b689a0297a 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -421,7 +421,7 @@ struct Environment { if (!retval) { static std::unordered_map globals = { {"print", std::make_shared()}, - {"tuple", std::make_shared()}, + {"tuple", SpecialFormValue::create(prim::TupleConstruct)}, {"float", makeMagic( "__float__", @@ -438,8 +438,8 @@ struct Environment { makeMagic( "__str__", std::make_shared(StringType::get(), aten::str))}, - {"getattr", std::make_shared()}, - {"isinstance", std::make_shared()}, + {"getattr", SpecialFormValue::create(prim::GetAttr)}, + {"isinstance", SpecialFormValue::create(prim::isinstance)}, // todo(zach): remove when we can correctly export torch.full via ONNX // or we have implicit conversion that can convert numbers to tensors {"_to_tensor", @@ -471,9 +471,9 @@ struct Environment { {"ord", std::make_shared(aten::ord, at::nullopt)}, {"chr", std::make_shared(aten::chr, at::nullopt)}, {"bin", std::make_shared(aten::bin, at::nullopt)}, - {"range", std::make_shared(prim::range)}, - {"zip", std::make_shared(prim::zip)}, - {"enumerate", std::make_shared(prim::enumerate)}, + {"range", SpecialFormValue::create(prim::range)}, + {"zip", SpecialFormValue::create(prim::zip)}, + {"enumerate", SpecialFormValue::create(prim::enumerate)}, {"rangelist", std::make_shared(prim::rangelist, at::nullopt)}, {"sorted", @@ -2346,104 +2346,186 @@ struct to_ir { std::shared_ptr emitApplyExpr(Apply& apply, size_t n_binders) { auto sv = emitSugaredExpr(apply.callee(), 1); auto loc = apply.callee().range(); - if (auto fork_value = dynamic_cast(sv.get())) { - auto& trees = apply.inputs().tree()->trees(); - if (trees.size() < 1) { - throw ErrorReport(loc) << 
"Expected at least one argument to fork()"; - } - auto forked = emitSugaredExpr(Expr(trees[0]), 1); - TreeList sliced_trees(trees.begin() + 1, trees.end()); - auto inputs = getNamedValues(sliced_trees, true); - auto attributes = emitAttributes(apply.attributes()); - return emitForkExpr(loc, forked, inputs, attributes); - } else if (auto annotate_value = dynamic_cast(sv.get())) { - checkApplyNumInputs(apply, 2); - TypePtr type = typeParser_.parseTypeFromExpr(apply.inputs()[0]); - Value* expr = tryConvertToType( - apply.range(), - *graph, - type, - emitExpr(apply.inputs()[1], type), - /*allow_conversions=*/true); + if (auto special_form = dynamic_cast(sv.get())) { + return emitApplySpecialForm(special_form->form(), apply); + } + auto inputs = getNamedValues(apply.inputs(), true); + auto attributes = emitAttributes(apply.attributes()); + return sv->call(loc, method, inputs, attributes, n_binders); + } + + // this function handles expressions that look like apply statements + // but have special evaluation rules for the arguments. + // when adding a new case, only add a special form if it cannot be expressed + // using the standard SugaredValue::call function, which enforces normal + // evaluation order. + std::shared_ptr emitApplySpecialForm( + Symbol form, + Apply& apply) { + switch (form) { + case prim::fork: { + auto& trees = apply.inputs().tree()->trees(); + if (trees.size() < 1) { + throw ErrorReport(apply) + << "Expected at least one argument to fork()"; + } + auto forked = emitSugaredExpr(Expr(trees[0]), 1); + TreeList sliced_trees(trees.begin() + 1, trees.end()); + auto inputs = getNamedValues(sliced_trees, true); + auto attributes = emitAttributes(apply.attributes()); + return emitForkExpr(apply.range(), forked, inputs, attributes); + } + case prim::annotate: { + checkApplyNumInputs(apply, 2); + TypePtr type = typeParser_.parseTypeFromExpr(apply.inputs()[0]); + Value* expr = tryConvertToType( + apply.range(), + *graph, + type, + emitExpr(apply.inputs()[1], type), + /*allow_conversions=*/true); - std::stringstream why_not; - if (!expr->type()->isSubtypeOfExt(type, &why_not)) { - throw ErrorReport(apply.inputs()) - << "expected an expression of type " << type->python_str() - << " but found " << expr->type()->python_str() << "\n" - << why_not.str(); - } - - // None is a subtype of Optional[T], but we want to remember what T is, - // after annotation so that variables assigned to this None will still - // get the right type. 
To do this, we make a None constant that - // has the type Optional[T] - if (type->kind() == OptionalType::Kind && - expr->type()->isSubtypeOf(NoneType::get())) { - Node* none = graph->createNone(); - none->output()->setType(type); - graph->insertNode(none); - expr = none->output(); - } - - return std::make_shared(expr); - } else if (auto getattr = dynamic_cast(sv.get())) { - checkApplyNumInputs(apply, 2); - auto obj = emitSugaredExpr(apply.inputs()[0], 1); - auto selector = apply.inputs()[1]; - if (selector.kind() != TK_STRINGLITERAL) { - throw ErrorReport(loc) - << "getattr's second argument must be a string literal"; - } - const std::string& name = StringLiteral(selector).text(); - return obj->attr(apply.range(), method, name); - } else if ( - auto uninitialized_value = - dynamic_cast(sv.get())) { - checkApplyNumInputs(apply, 1); - TypePtr type = typeParser_.parseTypeFromExpr(apply.inputs()[0]); - auto out = graph->insertNode(graph->createUninitialized(type)) - ->setSourceRange(loc); - return std::make_shared(out->output()); - } else if (auto tuple_call = dynamic_cast(sv.get())) { - checkApplyNumInputs(apply, 1); - auto arg = emitSugaredExpr(apply.inputs()[0], 1); - auto inputs = arg->asTuple(apply.range(), method); - auto inp_values = fmap(inputs, [&](const SugaredValuePtr& sv) { - return sv->asValue(loc, method); - }); - return std::make_shared( - graph->insertNode(graph->createTuple(inp_values))->output()); - } else if (auto isinstance = dynamic_cast(sv.get())) { - checkApplyNumInputs(apply, 2); - auto result = emitIsInstance(apply.inputs()[0], apply.inputs()[1]); - return std::make_shared(result.value()); - } else if (auto classNew = dynamic_cast(sv.get())) { - if (apply.inputs().size() != 1) { - throw ErrorReport(loc) << "Only one argument to __new__ allowed"; - } - auto arg = emitSugaredExpr(apply.inputs()[0], 1); - auto class_arg = dynamic_cast(arg.get()); - if (!class_arg) { - throw ErrorReport(loc) - << "Expected class value as argument to __new__, got " - << arg->kind() << " instead"; - } - if (class_arg->type_ != classNew->type_) { - throw ErrorReport(loc) - << "Argument to __new__() must match the class " - << "you are calling __new__() on. " - << "Got: " << class_arg->type_->python_str() - << ", expected: " << classNew->type_->python_str(); + std::stringstream why_not; + if (!expr->type()->isSubtypeOfExt(type, &why_not)) { + throw ErrorReport(apply.inputs()) + << "expected an expression of type " << type->python_str() + << " but found " << expr->type()->python_str() << "\n" + << why_not.str(); + } + + // None is a subtype of Optional[T], but we want to remember what T is, + // after annotation so that variables assigned to this None will still + // get the right type. 
To do this, we make a None constant that + // has the type Optional[T] + if (type->kind() == OptionalType::Kind && + expr->type()->isSubtypeOf(NoneType::get())) { + Node* none = graph->createNone(); + none->output()->setType(type); + graph->insertNode(none); + expr = none->output(); + } + + return std::make_shared(expr); } + case prim::GetAttr: { + checkApplyNumInputs(apply, 2); + auto obj = emitSugaredExpr(apply.inputs()[0], 1); + auto selector = apply.inputs()[1]; + if (selector.kind() != TK_STRINGLITERAL) { + throw ErrorReport(apply) + << "getattr's second argument must be a string literal"; + } + const std::string& name = StringLiteral(selector).text(); + return obj->attr(apply.range(), method, name); + } + case prim::Uninitialized: { + checkApplyNumInputs(apply, 1); + TypePtr type = typeParser_.parseTypeFromExpr(apply.inputs()[0]); + auto out = graph->insertNode(graph->createUninitialized(type)) + ->setSourceRange(apply.range()); + return std::make_shared(out->output()); + } + case prim::TupleConstruct: { + checkApplyNumInputs(apply, 1); + auto arg = emitSugaredExpr(apply.inputs()[0], 1); + auto inputs = arg->asTuple(apply.range(), method); + auto inp_values = fmap(inputs, [&](const SugaredValuePtr& sv) { + return sv->asValue(apply.range(), method); + }); + return std::make_shared( + graph->insertNode(graph->createTuple(inp_values))->output()); + } + case prim::isinstance: { + checkApplyNumInputs(apply, 2); + auto result = emitIsInstance(apply.inputs()[0], apply.inputs()[1]); + return std::make_shared(result.value()); + } + // This represents the "__new__" method on classes + // because it takes a ClassValue as input. + // So if we see: + // Foo.__new__(Foo) + // Foo is a ClassValue, calling `attr("__new__")` will return a + // CreateObject special form. + case prim::CreateObject: { + if (apply.inputs().size() != 1) { + throw ErrorReport(apply) << "Only one argument to __new__ allowed"; + } + auto arg = emitSugaredExpr(apply.inputs()[0], 1); + auto class_arg = dynamic_cast(arg.get()); + if (!class_arg) { + throw ErrorReport(apply) + << "Expected class value as argument to __new__, got " + << arg->kind() << " instead"; + } + auto createNode = + graph->insertNode(graph->createObject(class_arg->type_)); + return std::make_shared(createNode->output()); + } + // We construct the iterable tree here using the IterableTree + // SugaredValue, The tree consists of SimpleValue, RangeValue or + // IterableValue: For SimpleValues(List, Dict, etc) or RangeValue. We will + // make them as tree leaves since we could get the loop information from + // len() and get_item(). 
For IterableValue like zip(), enumerate(), we can + // model them as a combination of leaves, and we emit a IterableTree value + // to record the tree information + case prim::range: { + std::vector input_vals = + getValues(apply.inputs(), /*maybe_unpack=*/true); + return std::make_shared(apply.range(), method, input_vals); + } + case prim::enumerate: { + const SourceRange& loc = apply.range(); + auto inputs = apply.inputs(); + auto input_size = apply.inputs().size(); + // enumerate(x) can be rewrite as subtrees: + // IterableTree(RangeValue(0, math.inf), SimpleValue(x)) + Value* start_index = nullptr; + if (input_size == 0) { + throw ErrorReport(loc) + << "enumerate expected at least 1 arguments, got 0"; + } - return classNew->createObject(apply.range(), method); - } else if (auto iterable = std::dynamic_pointer_cast(sv)) { - return emitIterableTree(loc, apply.inputs(), iterable); - } else { - auto inputs = getNamedValues(apply.inputs(), true); - auto attributes = emitAttributes(apply.attributes()); - return sv->call(loc, method, inputs, attributes, n_binders); + if (input_size == 2) { + start_index = emitSugaredExpr(inputs[1], 1)->asValue(loc, method); + } + + if (input_size > 2) { + throw ErrorReport(loc) + << "enumerate expected at most 2 arguments, got " << input_size; + } + std::vector range_inputs; + if (start_index != nullptr) { + range_inputs.emplace_back(start_index); + } + Value* end = materializeConstant( + std::numeric_limits::max(), + *graph, + loc, + integral_constants); + range_inputs.emplace_back(end); + SugaredValuePtr range_sv = + std::make_shared(loc, method, range_inputs); + SugaredValuePtr expr_sv = emitSugaredExpr(inputs[0], 1); + return std::make_shared( + std::vector({range_sv, expr_sv})); + } + case prim::zip: { + // zip(x, y) can be rewrite as subtrees: + // IterableTree(IterableTree(x), IterableTree(y)) + auto inputs = apply.inputs(); + if (inputs.size() == 0) { + throw ErrorReport(apply) + << "zip expected at least 1 arguments, got 0"; + } + auto iterable_tree = std::make_shared(); + for (Expr expr : inputs) { + auto expr_sv = emitSugaredExpr(expr, 1); + iterable_tree->addChild(expr_sv); + } + return iterable_tree; + } + default: + TORCH_INTERNAL_ASSERT(false, "unknown special form: ", form); } } @@ -2522,69 +2604,6 @@ struct to_ir { return graph->insertConstant(stack[0], tree->range()); } - - // We construct the iterable tree here using the IterableTree SugaredValue, - // The tree consists of SimpleValue, RangeValue or IterableValue: - // For SimpleValues(List, Dict, etc) or RangeValue. We will make them as tree - // leaves since we could get the loop information from len() and get_item(). 
- // For IterableValue like zip(), enumerate(), we can model them as a - // combination of leaves, and we emit a IterableTree value to record the tree - // information - SugaredValuePtr emitIterableTree( - SourceRange& loc, - const List& inputs, - const std::shared_ptr& iterable) { - std::shared_ptr iterable_tree = nullptr; - size_t input_size = inputs.size(); - - // Handling different iterable values - if (iterable->symbol_ == prim::range) { - std::vector input_vals = getValues(inputs, /*maybe_unpack=*/true); - return std::make_shared(loc, method, input_vals); - } else if (iterable->symbol_ == prim::enumerate) { - // enumerate(x) can be rewrite as subtrees: - // IterableTree(RangeValue(0, math.inf), SimpleValue(x)) - Value* start_index = nullptr; - if (input_size == 0) { - throw ErrorReport(loc) - << "enumerate expected at least 1 arguments, got 0"; - } - - if (input_size == 2) { - start_index = emitSugaredExpr(inputs[1], 1)->asValue(loc, method); - } - - if (input_size > 2) { - throw ErrorReport(loc) - << "enumerate expected at most 2 arguments, got " << input_size; - } - std::vector range_inputs; - if (start_index != nullptr) { - range_inputs.emplace_back(start_index); - } - Value* end = materializeConstant( - std::numeric_limits::max(), *graph, loc, integral_constants); - range_inputs.emplace_back(end); - SugaredValuePtr range_sv = - std::make_shared(loc, method, range_inputs); - SugaredValuePtr expr_sv = emitSugaredExpr(inputs[0], 1); - iterable_tree = std::make_shared( - std::vector({range_sv, expr_sv})); - } else if (iterable->symbol_ == prim::zip) { - // zip(x, y) can be rewrite as subtrees: - // IterableTree(IterableTree(x), IterableTree(y)) - if (inputs.size() == 0) { - throw ErrorReport(loc) << "zip expected at least 1 arguments, got 0"; - } - iterable_tree = std::make_shared(); - for (Expr expr : inputs) { - auto expr_sv = emitSugaredExpr(expr, 1); - iterable_tree->addChild(expr_sv); - } - } - return iterable_tree; - } - std::shared_ptr emitForkExpr( SourceRange loc, const std::shared_ptr& forked, diff --git a/torch/csrc/jit/script/python_sugared_value.cpp b/torch/csrc/jit/script/python_sugared_value.cpp index 9e28bab926751..90259466cac08 100644 --- a/torch/csrc/jit/script/python_sugared_value.cpp +++ b/torch/csrc/jit/script/python_sugared_value.cpp @@ -578,10 +578,10 @@ std::shared_ptr toSugaredValue( } else if (py::isinstance(obj)) { return std::make_shared(obj); } else if (obj.ptr() == py::module::import("torch.jit").attr("_fork").ptr()) { - return std::make_shared(); + return SpecialFormValue::create(prim::fork); } else if ( obj.ptr() == py::module::import("torch.jit").attr("annotate").ptr()) { - return std::make_shared(); + return SpecialFormValue::create(prim::annotate); } else if (auto callee = as_module(obj)) { throw ErrorReport(loc) << "Cannot call a ScriptModule that is not" << " a submodule of the caller"; diff --git a/torch/csrc/jit/script/sugared_value.cpp b/torch/csrc/jit/script/sugared_value.cpp index 05b186c315281..a07dd95061076 100644 --- a/torch/csrc/jit/script/sugared_value.cpp +++ b/torch/csrc/jit/script/sugared_value.cpp @@ -476,7 +476,7 @@ std::shared_ptr ClassValue::attr( if (field != "__new__") { throw ErrorReport(loc) << "Tried to lookup unknown attribute on class"; } - return std::make_shared(type_); + return SpecialFormValue::create(prim::CreateObject); } std::shared_ptr NamedTupleConstructor::call( diff --git a/torch/csrc/jit/script/sugared_value.h b/torch/csrc/jit/script/sugared_value.h index 5fd61f0a01431..edf543c02f2ec 100644 --- 
a/torch/csrc/jit/script/sugared_value.h +++ b/torch/csrc/jit/script/sugared_value.h @@ -393,53 +393,27 @@ struct TORCH_API MagicMethod : public SugaredValue { std::string desugared_name_; }; -// These SugaredValues have special handling in the compiler because they -// change the normal evalution order of the expression they participate in. -// They are exposed here so that the python frontend can inject them -// when it sees the equivalent thing in python - -struct TORCH_API ForkValue : public SugaredValue { - ForkValue() = default; - std::string kind() const override { - return "fork"; - } -}; -struct TORCH_API AnnotateValue : public SugaredValue { - AnnotateValue() = default; - std::string kind() const override { - return "annotate"; - } -}; - -struct TORCH_API UninitializedValue : public SugaredValue { - UninitializedValue() = default; +// things that look like function applications, but +// perform non-standard evaluation are represented +// with SpecialFormValues, e.g. +// isinstance(x, int) +// fork(fn) +// annotate(int, 3) +// The implementation of each value is handled by a case inside emitApplyExpr +struct TORCH_API SpecialFormValue : public SugaredValue { + SpecialFormValue(Symbol form) : form_(form) {} std::string kind() const override { - return "uninitialized"; + return form_.toUnqualString(); } -}; - -// matched against for special handling of getattr expressions -struct TORCH_API GetAttrValue : SugaredValue { - GetAttrValue() = default; - std::string kind() const override { - return "getattr"; + Symbol form() const { + return form_; } -}; - -// matched against for special handling of isinstance expressions -struct TORCH_API IsInstanceValue : SugaredValue { - IsInstanceValue() = default; - std::string kind() const override { - return "isinstance"; + static std::shared_ptr create(Symbol form) { + return std::make_shared(form); } -}; -// matched against for special handling of tuple() call -struct TORCH_API TupleCallValue : SugaredValue { - TupleCallValue() = default; - std::string kind() const override { - return "tuple"; - } + private: + Symbol form_; }; // matched against for special handling of range expressions @@ -462,15 +436,6 @@ struct TORCH_API RangeValue : SugaredValue { bool has_only_end_; }; -// matched against for special handling of iterables like zip(), enumerate() -struct TORCH_API IterableValue : SugaredValue { - IterableValue(Symbol symbol) : symbol_(symbol) {} - std::string kind() const override { - return "iterable"; - } - Symbol symbol_; -}; - // Specialized Tree structure to matched against for special handling // of builtin functions iterables expressions like zip(), enumerate(), etc. // zip and enumerate can be modeled as a tree of SimpleValue/RangeValue: @@ -508,28 +473,6 @@ struct TORCH_API IterableTree : SugaredValue { std::vector children_; }; -// This represents the "__new__" method on classes, which can't be a MethodValue -// because it takes a ClassValue as input. -// So if we see: -// Foo.__new__(Foo) -// Foo is a ClassValue, calling `attr("__new__")` will return a ClassNewMethod. 
-struct TORCH_API ClassNewMethod : public SugaredValue { - ClassNewMethod(ClassTypePtr type) : type_(type) {} - std::string kind() const override { - return "class.__new__"; - } - - std::shared_ptr createObject( - const SourceRange& loc, - Function& m) { - auto& g = *m.graph(); - auto createNode = g.insertNode(g.createObject(type_)); - return std::make_shared(createNode->output()); - } - - ClassTypePtr type_; -}; - static inline std::vector toValues( Graph& g, at::ArrayRef nvs) { From 42e7eb0426190e07339f03d4e6afb61b7ff5ae9c Mon Sep 17 00:00:00 2001 From: sribkain Date: Thu, 3 Oct 2019 21:43:55 -0700 Subject: [PATCH 045/113] Minor readability fixes to C++ documentation (#27338) Summary: Changed `yieldings` to `yielding`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27338 Differential Revision: D17758406 Pulled By: yf225 fbshipit-source-id: 1633834a6ad80449c061ebc330ac24f3e42f5506 --- docs/cpp/source/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cpp/source/index.rst b/docs/cpp/source/index.rst index f620ff4208c82..b7d914c8517b2 100644 --- a/docs/cpp/source/index.rst +++ b/docs/cpp/source/index.rst @@ -53,7 +53,7 @@ ATen ``Tensor`` class with capabilities concerning automatic differentiation. The autograd system records operations on tensors to form an *autograd graph*. Calling ``backwards()`` on a leaf variable in this graph performs reverse mode differentiation through the network of functions and tensors spanning the -autograd graph, ultimately yieldings gradients. The following example provides +autograd graph, ultimately yielding gradients. The following example provides a taste of this interface: .. code-block:: cpp @@ -68,7 +68,7 @@ a taste of this interface: The ``at::Tensor`` class in ATen is not differentiable by default. To add the differentiability of tensors the autograd API provides, you must use tensor -factory functions from the `torch::` namespace instead of the `at` namespace. +factory functions from the `torch::` namespace instead of the `at::` namespace. For example, while a tensor created with `at::ones` will not be differentiable, a tensor created with `torch::ones` will be. From d0a4b2f586e0901c3c65f1f0e0bae15364e28821 Mon Sep 17 00:00:00 2001 From: Peter Bell Date: Thu, 3 Oct 2019 23:30:02 -0700 Subject: [PATCH 046/113] Choose num_threads in parallel_for based on GRAIN_SIZE (#26963) Summary: Fixes https://github.com/pytorch/pytorch/issues/24080, Continuation of https://github.com/pytorch/pytorch/issues/26886 What soumith said in https://github.com/pytorch/pytorch/pull/26886#issuecomment-535760635 seems plausible > I wonder if it has to do with `#pragma omp parallel num_threads(num_threads)` which has unintended consequences, where even if `num_threads=1`, entering an omp block inside an omp block results in bad behavior. I know for a fact that gcc's openmp doesn't start the thread pool when given `num_threads(1)` but it seems clang behaves differently. 
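As a sanity check on the intended behavior, a Python mirror of the new task-count logic (the function name and signature here are illustrative, not part of the ATen API):

    def omp_task_count(begin, end, grain_size, max_threads, nested=False):
        # one task inside a nested parallel region; otherwise cap the task
        # count so each task handles at least grain_size elements
        num_threads = 1 if nested else max_threads
        if grain_size > 0:
            num_threads = min(num_threads, -((begin - end) // grain_size))  # divup
        return num_threads

    assert omp_task_count(0, 100, grain_size=64, max_threads=8) == 2   # ceil(100/64)
    assert omp_task_count(0, 100, grain_size=64, max_threads=8, nested=True) == 1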
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26963 Differential Revision: D17626981 Pulled By: soumith fbshipit-source-id: 484ffe6cc172382bb5ff49ce1fceda7eba20a512 --- aten/src/ATen/ParallelOpenMP.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/aten/src/ATen/ParallelOpenMP.h b/aten/src/ATen/ParallelOpenMP.h index e1e4b9bf44332..fecb9858d37d8 100644 --- a/aten/src/ATen/ParallelOpenMP.h +++ b/aten/src/ATen/ParallelOpenMP.h @@ -25,7 +25,13 @@ inline void parallel_for( #ifdef _OPENMP std::atomic_flag err_flag = ATOMIC_FLAG_INIT; std::exception_ptr eptr; -#pragma omp parallel if (!omp_in_parallel() && ((end - begin) >= grain_size)) + // choose number of tasks based on grain size and number of threads + int64_t num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); + if (grain_size > 0) { + num_threads = std::min(num_threads, divup((end - begin), grain_size)); + } + +#pragma omp parallel num_threads(num_threads) { int64_t num_threads = omp_get_num_threads(); int64_t tid = omp_get_thread_num(); From 76f847546b46890b9c028c7a98f36db8674bc0f5 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Fri, 4 Oct 2019 00:22:34 -0700 Subject: [PATCH 047/113] Enable Python3.6 PyTorch ROCm CI Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27353 Differential Revision: D17758495 Pulled By: bddppq fbshipit-source-id: 95e329bc30f092e4093a33c408f1647b803d9983 --- .jenkins/pytorch/common.sh | 4 +++- .jenkins/pytorch/test.sh | 6 ++++++ test/common_device_type.py | 2 +- test/test_torch.py | 2 ++ 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 8d6764ec1ae09..d727dcd57272f 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -158,7 +158,9 @@ fi function pip_install() { # retry 3 times - pip install --progress-bar off "$@" || pip install --progress-bar off "$@" || pip install --progress-bar off "$@" + # old versions of pip don't have the "--progress-bar" flag + pip install --progress-bar off "$@" || pip install --progress-bar off "$@" || pip install --progress-bar off "$@" ||\ + pip install "$@" || pip install "$@" || pip install "$@" } function pip_uninstall() { diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index e5cec7aeb6e07..ec2b05e2c62e3 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -32,6 +32,12 @@ if [ -n "${IN_CIRCLECI}" ]; then fi fi +if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + # TODO: Move this to Docker + sudo apt-get -qq update + sudo apt-get -qq install --no-install-recommends libsndfile1 +fi + # --user breaks ppc64le builds and these packages are already in ppc64le docker if [[ "$BUILD_ENVIRONMENT" != *ppc64le* ]]; then # JIT C++ extensions require ninja. 
diff --git a/test/common_device_type.py b/test/common_device_type.py index 101a303fa05ca..6f765ebf211c9 100644 --- a/test/common_device_type.py +++ b/test/common_device_type.py @@ -442,7 +442,7 @@ def wrap_fn(self, device, *args, **kwargs): if self.no_cudnn: reason = "cuDNN not available" raise unittest.SkipTest(reason) - if self.cudnn_version < version: + if self.cudnn_version is None or self.cudnn_version < version: reason = "cuDNN version {0} is available but {1} required".format(self.cudnn_version, version) raise unittest.SkipTest(reason) diff --git a/test/test_torch.py b/test/test_torch.py index b5cf74e149967..7f73a3efe2b4a 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -11794,6 +11794,8 @@ def test_var_mean_some_dims(self, device): # passes on ROCm w/ python 2.7, fails w/ python 3.6 @skipCUDAIfRocm + # stft -> rfft -> _fft -> _fft_with_size -> _fft_mkl + @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support") def test_stft(self, device): if not TEST_LIBROSA: raise unittest.SkipTest('librosa not found') From 527b10c2d1b605828df3b4844436ef0636d2c5e6 Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Fri, 4 Oct 2019 02:21:02 -0700 Subject: [PATCH 048/113] Fixes PackedSequence.to (and unifies PackedSequence conversions) (#27245) Summary: PackedSequence.to(device) incorrectly places one of three tensors on the device and leaves the other two tensors where they are. If these devices are distinct then further operations on PackedSequence will fail. This behavior is inconsistent with Tensor.to and PackedSequence's behavior when .cuda() is called. Additionally, PackedSequence defines multiple other conversion functions that were independently and inconsistently implemented. This PR unifies all implementations and makes the PackedSequence.to behavior more consistent with Tensor.to. It is not completely consistent per comments. test_device_mask in test_nn.py is updated to validate the new functionality. 
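A rough illustration of the fixed behavior (a hypothetical snippet; it needs a CUDA device for the move to be observable):

    import torch
    from torch.nn.utils.rnn import pack_padded_sequence

    padded = torch.randn(5, 3, 4)  # (seq_len, batch, features)
    packed = pack_padded_sequence(padded, lengths=[5, 4, 2], enforce_sorted=False)
    if torch.cuda.is_available():
        moved = packed.to('cuda')
        # previously only packed.data was guaranteed to move; now the data
        # and both index tensors travel together
        assert moved.data.is_cuda
        assert moved.sorted_indices.is_cuda and moved.unsorted_indices.is_cuda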
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27245 Differential Revision: D17757850 Pulled By: mruberry fbshipit-source-id: 58f0bd40f1aa300fb0a91ee743483d645f977dc5 --- test/test_nn.py | 4 +- torch/nn/utils/rnn.py | 105 +++++++++++------------------------------- 2 files changed, 31 insertions(+), 78 deletions(-) diff --git a/test/test_nn.py b/test/test_nn.py index e6dd049ddba71..0fe2c005eb0f3 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -187,6 +187,7 @@ def test_to(self): padded, lengths, enforce_sorted=enforce_sorted).cpu() self.assertIs(a, a.to('cpu')) + self.assertIs(a, a.cpu()) self.assertIs(a, a.to('cpu', dtype=torch.int32)) self.assertEqual(a.long(), a.to(torch.int64)) @@ -194,6 +195,7 @@ def test_to(self): for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']: b = a.cuda(device=cuda) self.assertIs(b, b.to(cuda)) + self.assertIs(b, b.cuda()) self.assertEqual(a, b.to('cpu')) self.assertEqual(b, a.to(cuda)) self.assertEqual(a, b.to('cpu', dtype=torch.int32)) @@ -8987,7 +8989,7 @@ def test_device_mask(self, device): packed = rnn_utils.pack_padded_sequence( padded, lengths, enforce_sorted=enforce_sorted) self.assertFalse(packed.is_cuda) - packed = packed.to(device=device) + packed = packed.to(device) self.assertTrue(packed.is_cuda) unpacked, _ = rnn_utils.pad_packed_sequence(packed) self.assertEqual(unpacked.type(), cuda_type_str) diff --git a/torch/nn/utils/rnn.py b/torch/nn/utils/rnn.py index bfb9768a65dff..7a4637c3232c7 100644 --- a/torch/nn/utils/rnn.py +++ b/torch/nn/utils/rnn.py @@ -100,95 +100,49 @@ def pin_memory(self): bind(self.unsorted_indices, lambda t: t.pin_memory())) def cuda(self, *args, **kwargs): - """Returns a GPU copy if `self.data` not already on the GPU""" - if self.is_cuda: - return self - else: - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.cuda(*args, **kwargs), self.batch_sizes, - bind(self.sorted_indices, lambda t: t.cuda(*args, **kwargs)), - bind(self.unsorted_indices, lambda t: t.cuda(*args, **kwargs))) - - def cpu(self): - """Returns a CPU copy if `self.data` not already on the CPU""" - if self.is_cuda: - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.cpu(), self.batch_sizes, - bind(self.sorted_indices, lambda t: t.cpu()), - bind(self.unsorted_indices, lambda t: t.cpu())) - else: - return self + # Tests to see if 'cuda' should be added to kwargs + ex = torch.tensor((), dtype=self.data.dtype, device=self.data.device).to(*args, **kwargs) + if ex.is_cuda: + return self.to(*args, **kwargs) + return self.to(*args, device='cuda', **kwargs) - def double(self): - r"""Returns copy with `self.data` cast to double type""" + def cpu(self, *args, **kwargs): - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.double(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + ex = torch.tensor((), dtype=self.data.dtype, device=self.data.device).to(*args, **kwargs) + if ex.device.type == 'cpu': + return self.to(*args, **kwargs) + return self.to(*args, device='cpu', **kwargs) - def float(self): - r"""Returns copy with `self.data` cast to float type""" + def double(self): + return self.to(dtype=torch.double) - # Why not convert `batch_sizes`? 
- # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.float(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + def float(self): + return self.to(dtype=torch.float) def half(self): - r"""Returns copy with `self.data` cast to half type""" - - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.half(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + return self.to(dtype=torch.half) def long(self): - r"""Returns copy with `self.data` cast to long type""" - - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.long(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + return self.to(dtype=torch.long) def int(self): - r"""Returns copy with `self.data` cast to int type""" - - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.int(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + return self.to(dtype=torch.int) def short(self): - r"""Returns copy with `self.data` cast to short type""" - - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.short(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + return self.to(dtype=torch.short) def char(self): - r"""Returns copy with `self.data` cast to char type""" - - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.char(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + return self.to(dtype=torch.int8) def byte(self): - r"""Returns copy with `self.data` cast to byte type""" - - # Why not convert `batch_sizes`? - # See NOTE [ device and dtype of a PackedSequence ] - return type(self)(self.data.byte(), self.batch_sizes, - self.sorted_indices, self.unsorted_indices) + return self.to(dtype=torch.uint8) def to(self, *args, **kwargs): r"""Performs dtype and/or device conversion on `self.data`. - It has similar signature as :meth:`torch.Tensor.to`. + It has similar signature as :meth:`torch.Tensor.to`, except optional + arguments like `non_blocking` and `copy` should be passed as kwargs, + not args, or they will not apply to the index tensors. .. note:: @@ -200,17 +154,14 @@ def to(self, *args, **kwargs): # Why not convert `batch_sizes`? 
# See NOTE [ device and dtype of a PackedSequence ] data = self.data.to(*args, **kwargs) - sorted_indices = self.sorted_indices - unsorted_indices = self.unsorted_indices - device_kw = 'device' - if device_kw in kwargs: - sorted_indices = bind(sorted_indices, lambda t: t.to(kwargs[device_kw])) - unsorted_indices = bind(unsorted_indices, lambda t: t.to(kwargs[device_kw])) if data is self.data: return self else: - return type(self)(data, self.batch_sizes, - sorted_indices, unsorted_indices) + # Does not forward device or dtype arg/kwargs, device is set from data.device + kwargs = {k : v for k, v in filter(lambda t: t[0] != 'device' and t[0] != 'dtype', kwargs.items())} + sorted_indices = bind(self.sorted_indices, lambda t: t.to(data.device, **kwargs)) + unsorted_indices = bind(self.unsorted_indices, lambda t: t.to(data.device, **kwargs)) + return type(self)(data, self.batch_sizes, sorted_indices, unsorted_indices) @property def is_cuda(self): From a7de545c6374cccd27cb871e284f774bc2395bbe Mon Sep 17 00:00:00 2001 From: Mike Ruberry Date: Fri, 4 Oct 2019 02:39:26 -0700 Subject: [PATCH 049/113] Makes test_cuda.py's generated tensor op tests generic (#27210) Summary: - The tensor op tests generated in test_cuda.py are now generic and appear in test_torch,py - Data previously held in auxiliary data structures and files, like test_cuda_ignores.txt, is inlined Previously the tensor op tests used several auxiliary data structures, a file, and exception handling to filter the test suite. If a function wasn't implemented, for example, that exception would be caught. This let functions like trigamma, which isn't callable, appear to be tested. See https://github.com/pytorch/pytorch/issues/27230. Filtering from additional data stores is error prone, too. It requires developers understand what data stores are used and how they're used. The existing sources are also sometimes incorrect. The txt file claims that dist_ doesn't work on half tensors, for example, but the updated tests verify it does. In addition to making these tests generic, this PR removes those auxiliary data structures and does not catch any exceptions. Exceptions are errors. (This also means that if something implemented breaks it will now report as an error. Previously the test suite would have reported a pass.) The test infrastructure was also simplified to not perform computations with CPU half tensors since they do not support many operations. This introduces a float<->half conversion quirk but eliminates awkward functions that would first convert cpu tensors to float, perform an operation, and convert them back. With this change test_cuda.py is almost entirely CUDA-specific. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27210 Differential Revision: D17757907 Pulled By: mruberry fbshipit-source-id: b3c191c379667b1a7d5361087bdf82f397f77f65 --- test/common_device_type.py | 7 +- test/data/test_cuda_ignores.txt | 112 ------ test/test_cuda.py | 668 +------------------------------- test/test_torch.py | 536 ++++++++++++++++++++++++- 4 files changed, 539 insertions(+), 784 deletions(-) delete mode 100644 test/data/test_cuda_ignores.txt diff --git a/test/common_device_type.py b/test/common_device_type.py index 6f765ebf211c9..b3352182c643d 100644 --- a/test/common_device_type.py +++ b/test/common_device_type.py @@ -148,8 +148,8 @@ def _get_dtypes(cls, test): # Creates device-specific tests. 
@classmethod - def instantiate_test(cls, test): - test_name = test.__name__ + "_" + cls.device_type + def instantiate_test(cls, name, test): + test_name = name + "_" + cls.device_type dtypes = cls._get_dtypes(test) if dtypes is None: # Test has no dtype variants @@ -255,7 +255,6 @@ def instantiate_device_type_tests(generic_test_class, scope, except_for=None): for name in generic_members: if name in generic_tests: # Instantiates test member - # Requires tests be a function for Python2 compat # (In Python2 tests are type checked methods wrapping functions) test = getattr(generic_test_class, name) @@ -264,7 +263,7 @@ def instantiate_device_type_tests(generic_test_class, scope, except_for=None): assert inspect.isfunction(test), "Couldn't extract function from '{0}'".format(name) # Instantiates the device-specific tests - device_type_test_class.instantiate_test(test) + device_type_test_class.instantiate_test(name, test) else: # Ports non-test member assert not hasattr(device_type_test_class, name), "Redefinition of non-test member {0}".format(name) diff --git a/test/data/test_cuda_ignores.txt b/test/data/test_cuda_ignores.txt deleted file mode 100644 index afad1c57906e5..0000000000000 --- a/test/data/test_cuda_ignores.txt +++ /dev/null @@ -1,112 +0,0 @@ -# List of functions that are not implemented on CUDA tensors -# These are skipped by test_cuda.py -torch.ByteTensor.dist -torch.ByteTensor.dot -torch.ByteTensor.lerp -torch.ByteTensor.lerp_ -torch.ByteTensor.mean -torch.ByteTensor.norm -torch.ByteTensor.renorm -torch.ByteTensor.renorm_ -torch.ByteTensor.std -torch.ByteTensor.var -torch.CharTensor.dist -torch.CharTensor.dot -torch.CharTensor.lerp -torch.CharTensor.lerp_ -torch.CharTensor.mean -torch.CharTensor.norm -torch.CharTensor.renorm -torch.CharTensor.renorm_ -torch.CharTensor.std -torch.CharTensor.var -torch.HalfTensor.chunk_ -torch.HalfTensor.clone_ -torch.HalfTensor.contiguous_ -torch.HalfTensor.cross_ -torch.HalfTensor.cumprod_ -torch.HalfTensor.cumsum_ -torch.HalfTensor.dim_ -torch.HalfTensor.dist_ -torch.HalfTensor.dot_ -torch.HalfTensor.element_size_ -torch.HalfTensor.equal_ -torch.HalfTensor.expand_ -torch.HalfTensor.expand_as_ -torch.HalfTensor.eye -torch.HalfTensor.eye_ -torch.HalfTensor.fill -torch.HalfTensor.geqrf -torch.HalfTensor.geqrf_ -torch.HalfTensor.inverse -torch.HalfTensor.inverse_ -torch.HalfTensor.is_contiguous_ -torch.HalfTensor.is_same_size_ -torch.HalfTensor.is_set_to_ -torch.HalfTensor.kthvalue_ -torch.HalfTensor.max_ -torch.HalfTensor.mean_ -torch.HalfTensor.min_ -torch.HalfTensor.mode_ -torch.HalfTensor.narrow_ -torch.HalfTensor.ndimension_ -torch.HalfTensor.nelement_ -torch.HalfTensor.nonzero_ -torch.HalfTensor.norm_ -torch.HalfTensor.numel_ -torch.HalfTensor.ones -torch.HalfTensor.ones_ -torch.HalfTensor.permute_ -torch.HalfTensor.prod_ -torch.HalfTensor.put__ -torch.HalfTensor.qr -torch.HalfTensor.qr_ -torch.HalfTensor.repeat_ -torch.HalfTensor.size_ -torch.HalfTensor.sort_ -torch.HalfTensor.split_ -torch.HalfTensor.std_ -torch.HalfTensor.sum_ -torch.HalfTensor.take_ -torch.HalfTensor.to_list -torch.HalfTensor.to_list_ -torch.HalfTensor.topk_ -torch.HalfTensor.trace_ -torch.HalfTensor.trigamma -torch.HalfTensor.trigamma_ -torch.HalfTensor.var_ -torch.HalfTensor.view_ -torch.HalfTensor.view_as_ -torch.HalfTensor.zero -torch.HalfTensor.zeros -torch.HalfTensor.zeros_ -torch.IntTensor.dist -torch.IntTensor.dot -torch.IntTensor.lerp -torch.IntTensor.lerp_ -torch.IntTensor.mean -torch.IntTensor.norm -torch.IntTensor.renorm -torch.IntTensor.renorm_ 
-torch.IntTensor.std -torch.IntTensor.var -torch.LongTensor.dist -torch.LongTensor.dot -torch.LongTensor.lerp -torch.LongTensor.lerp_ -torch.LongTensor.mean -torch.LongTensor.norm -torch.LongTensor.renorm -torch.LongTensor.renorm_ -torch.LongTensor.std -torch.LongTensor.var -torch.ShortTensor.dist -torch.ShortTensor.dot -torch.ShortTensor.lerp -torch.ShortTensor.lerp_ -torch.ShortTensor.mean -torch.ShortTensor.norm -torch.ShortTensor.renorm -torch.ShortTensor.renorm_ -torch.ShortTensor.std -torch.ShortTensor.var diff --git a/test/test_cuda.py b/test/test_cuda.py index 4842e434e8237..3ce95bfaf1e58 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -21,9 +21,9 @@ from common_methods_invocations import tri_tests_args, tri_large_tests_args, \ _compare_trilu_indices, _compare_large_trilu_indices -from common_utils import TestCase, get_gpu_type, to_gpu, freeze_rng_state, run_tests, \ +from common_utils import TestCase, get_gpu_type, freeze_rng_state, run_tests, \ PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, skipIfRocm, \ - TEST_WITH_ROCM, load_tests, slowTest, skipCUDANonDefaultStreamIf + load_tests, slowTest, skipCUDANonDefaultStreamIf # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -49,25 +49,6 @@ TEST_LARGE_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 12e9 TEST_MEDIUM_TENSOR = torch.cuda.get_device_properties(0).total_memory >= 6e9 -floating_set = {torch.FloatTensor, torch.DoubleTensor, torch.cuda.FloatTensor, - torch.cuda.DoubleTensor, torch.HalfTensor, torch.cuda.HalfTensor} - - -def is_floating(t): - if not isinstance(t, type): - raise TypeError('t should be an instance of type') - assert t != torch.autograd.Variable - return t in floating_set - - -def is_half(t): - if isinstance(t, torch.Tensor): - return t.dtype == torch.float16 - assert isinstance(t, type) - assert t != torch.autograd.Variable - return t in [torch.HalfTensor, torch.cuda.HalfTensor] - - types = [ torch.FloatTensor, torch.DoubleTensor, @@ -79,54 +60,6 @@ def is_half(t): torch.HalfTensor, ] -signed_types = [ - torch.FloatTensor, - torch.DoubleTensor, - torch.LongTensor, - torch.IntTensor, - torch.ShortTensor, - torch.CharTensor, -] - -unsigned_types = [ - torch.ByteTensor, -] - -float_types = [ - torch.FloatTensor, - torch.DoubleTensor, - torch.HalfTensor, -] - -float_types_no_half = [ - torch.FloatTensor, - torch.DoubleTensor, -] - - -def number(floating, integer, t): - return floating if is_floating(t) else integer - - -def cast_tensor(tensor, t): - return t(tensor.size()).copy_(tensor) - -S = 10 -M = 50 -G = 275000000 - - -def make_tensor(t, *sizes): - if 'Half' in t.__name__: - return t(*sizes).copy_(torch.randn(*sizes)) - else: - tensor = t(*sizes) - if tensor.is_floating_point(): - return tensor.normal_() - else: - return tensor.random_(0, 10) - - def make_sparse_tensor(t, n, *sizes): assert t.is_sparse tensor = t() @@ -137,478 +70,8 @@ def make_sparse_tensor(t, n, *sizes): v = v.new(n).copy_(torch.randn(n)) return t(i, v, torch.Size(sizes)) - -def tensor_clamp(t, min, max): - if is_half(t): - return t.float().clamp(min, max).half() - else: - return t.clamp(min, max) - - -def tensor_mul(t, scale): - if is_half(t): - return t.float().mul(scale).half() - else: - return t.mul(scale) - - -def tensor_abs_(t): - if is_half(t): - return t.float().abs_().half() - else: - return t.abs_() - - -def constant_tensor_sub(a, b): - # helper function to address const - torch.HalfTensor where it doesn't - # have resize_as() - 
if is_half(b): - return (a - b.float()).half() - else: - return a - b - - -def constant_tensor_add(a, b): - # helper function to address const + torch.HalfTensor where it doesn't - # have add() - if is_half(b): - return (a + b.float()).half() - else: - return a + b - - -def small_0d(t): - return make_tensor(t, (1,)).squeeze() - - -def small_2d(t): - return make_tensor(t, S, S) - - -def small_2d_scaled(t, scale=10): - return tensor_mul(make_tensor(t, S, S), scale) - - -def small_2d_oneish(t): - if is_floating(t): - return tensor_clamp(make_tensor(t, S, S), min=0.99, max=1.01) - else: - return t(S, S).fill_(1) - - -def small_3d(t): - return make_tensor(t, S, S, S) - - -def medium_1d(t): - return make_tensor(t, M) - - -def medium_2d(t): - return make_tensor(t, M, M) - - -def medium_2d_expanded(t): - return t(1).expand(M, M) - - -def medium_2d_scaled(t, scale=10): - return tensor_mul(make_tensor(t, M, M), scale) - - -def small_3d_ones(t): - return t(S, S, S).copy_(torch.ones(S, S, S)) - - -def small_3d_positive(t): - # In div_tensor(), half cannot achieve float precision - min_val = 1e-3 if is_floating(t) and not is_half(t) else 2 - return tensor_clamp(make_tensor(t, S, S, S), min_val, 120) - - -def small_3d_unique(t): - return t(S, S, S).copy_(torch.arange(1, S * S * S + 1).view(S, S, S)) - - -def small_1d_lapack(t): - return t(1, 3).copy_(torch.arange(1, 4).view(3)) - - -def small_2d_lapack(t): - return t(3, 3).copy_(torch.arange(1, 10).view(3, 3)) - - -def small_2d_lapack_skinny(t): - return t(3, 4).copy_(torch.arange(1, 13).view(3, 4)) - - -def small_2d_lapack_fat(t): - return t(4, 3).copy_(torch.arange(1, 13).view(4, 3)) - - -def large_2d_lapack(t): - return t(1000, 1000).normal_() - - -def giant_1d_ones(t): - return t(G).copy_(torch.ones(G)) - - -def long_type(t): - return torch.cuda.LongTensor if 'cuda' in t.__module__ else torch.LongTensor - - -def new_t(*sizes): - def tmp(t): - return t(*sizes).copy_(torch.randn(*sizes)) - return tmp - -# Content of each tuple: -# - function name -# - constructor for the tensor, signature: fn(tensor_type) -> tensor -# - constructor for the arguments, signature: fn(tensor_type) -> list -# - postfix name for the test (must be unique for a given function) (default='') -# - tensor types to use (default=types) -# - disable inplace test, if set to True, no inplace test will be done (default=False) -# - decorator, e.g., unittest.skipIf (default is no decorator) -tests = [ - ('add', small_3d, lambda t: [number(3.14, 3, t)]), - ('add', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('add', small_3d, lambda t: [number(0.2, 2, t), small_3d_positive(t)], 'scalar_tensor'), - ('sub', small_3d, lambda t: [number(3.14, 3, t)]), - ('sub', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('mul', small_3d, lambda t: [number(3.14, 3, t)]), - ('mul', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('mul', small_0d, lambda t: [small_0d(torch.IntTensor)], 'scalar', types, True), - ('div', small_3d, lambda t: [number(3.14, 3, t)]), - ('div', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('pow', small_3d, lambda t: [number(3.14, 3, t)], None, float_types), - ('pow', small_3d, lambda t: [number(1., 1, t)], 'pow1'), - ('pow', small_3d, lambda t: [number(2., 2, t)], 'pow2'), - ('pow', small_3d, lambda t: [number(3., 3, t)], 'pow3'), - ('pow', small_3d, lambda t: [number(-1., -1, t)], 'pow-1', float_types), - # HalfTensor gives bad result at pow-2 with data sampled from torch.randn - ('pow', small_3d, lambda t: [number(-2., -2, t)], 'pow-2', 
float_types_no_half, False, - "skipIfRocm:FloatTensor"), - ('pow', small_3d, lambda t: [tensor_abs_(small_3d(t))], 'tensor', float_types), - ('addbmm', small_2d, lambda t: [small_3d(t), small_3d(t)], None, float_types), - ('addbmm', small_2d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('addbmm', small_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), - ('baddbmm', small_3d, lambda t: [small_3d(t), small_3d(t)],), - ('baddbmm', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('baddbmm', small_3d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), small_3d(t), small_3d(t)], 'two_scalars'), - ('bmm', small_3d, lambda t: [small_3d(t)], '', float_types_no_half), - ('addcdiv', small_2d_lapack, lambda t: [tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)]), - ('addcdiv', small_2d_lapack, lambda t: [number(2.8, 1, t), tensor_mul(small_2d_lapack(t), 2), small_2d_lapack(t)], - 'scalar'), - ('addcmul', small_3d, lambda t: [small_3d(t), small_3d(t)]), - ('addcmul', small_3d, lambda t: [number(0.4, 2, t), small_3d(t), small_3d(t)], 'scalar'), - ('addmm', medium_2d, lambda t: [medium_2d(t), medium_2d(t)]), - ('addmm', medium_2d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'scalar'), - ('addmm', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_2d(t)], 'two_scalars'), - ('addmv', medium_1d, lambda t: [medium_2d(t), medium_1d(t)],), - ('addmv', medium_1d, lambda t: [number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'scalar'), - ('addmv', medium_1d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_2d(t), medium_1d(t)], 'two_scalars'), - ('addr', medium_2d, lambda t: [medium_1d(t), medium_1d(t)]), - ('addr', medium_2d, lambda t: [number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'scalar'), - ('addr', medium_2d, lambda t: [number(0.5, 3, t), number(0.4, 2, t), medium_1d(t), medium_1d(t)], 'two_scalars'), - ('atan2', medium_2d, lambda t: [medium_2d(t)], None, float_types + [torch.HalfTensor]), - ('fmod', small_3d, lambda t: [3], 'value',), - ('fmod', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('chunk', medium_2d, lambda t: [4],), - ('chunk', medium_2d, lambda t: [4, 1], 'dim'), - ('chunk', medium_2d, lambda t: [4, -2], 'neg_dim'), - ('clamp', medium_2d_scaled, lambda t: [-1, 5], None, signed_types), - ('clamp', medium_2d_scaled, lambda t: [1, 5], None, unsigned_types), - ('clone', medium_2d, lambda t: [],), - ('contiguous', medium_2d, lambda t: [],), - ('cross', new_t(M, 3, M), lambda t: [new_t(M, 3, M)(t)],), - ('cumprod', small_3d, lambda t: [1]), - ('cumprod', small_3d, lambda t: [-1], 'neg_dim'), - ('cumsum', small_3d, lambda t: [1]), - ('cumsum', small_3d, lambda t: [-1], 'neg_dim'), - ('dim', small_3d, lambda t: [],), - ('dist', small_2d, lambda t: [small_2d(t)]), - ('dist', small_2d, lambda t: [small_2d(t), 3], '3_norm'), - ('dist', small_2d, lambda t: [small_2d(t), 2.5], '2_5_norm'), - ('dot', medium_1d, lambda t: [medium_1d(t)], '', types, False, "skipIfRocm:HalfTensor"), - ('element_size', medium_1d, lambda t: [],), - ('eq', small_3d_ones, lambda t: [small_3d(t)],), - ('eq', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), - ('ne', small_3d_ones, lambda t: [small_3d(t)],), - ('ne', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), - ('equal', small_3d_ones, lambda t: [small_3d_ones(t)], 'equal'), - ('equal', small_3d_ones, lambda t: [small_3d(t)],), - ('expand', new_t(M, 1, M), lambda t: [M, 4, M],), - ('expand_as', 
new_t(M, 1, M), lambda t: [new_t(M, 4, M)(t)],), - ('fill', medium_2d, lambda t: [number(3.14, 3, t)]), - ('ge', medium_2d, lambda t: [medium_2d(t)],), - ('le', medium_2d, lambda t: [medium_2d(t)],), - ('gt', medium_2d, lambda t: [medium_2d(t)],), - ('lt', medium_2d, lambda t: [medium_2d(t)],), - ('is_contiguous', medium_2d, lambda t: [],), - # TODO: can't check negative case - GPU copy will be contiguous - ('is_same_size', medium_2d, lambda t: [small_3d(t)], 'negative'), - ('is_same_size', medium_2d, lambda t: [medium_2d(t)], 'positive'), - ('is_set_to', medium_2d, lambda t: [medium_2d(t)],), - # TODO: positive case - ('kthvalue', small_3d_unique, lambda t: [3],), - ('kthvalue', small_3d_unique, lambda t: [3, 1], 'dim'), - ('kthvalue', small_3d_unique, lambda t: [3, -1], 'neg_dim'), - ('lerp', small_3d, lambda t: [small_3d(t), 0.3]), - ('max', small_3d_unique, lambda t: []), - ('max', small_3d_unique, lambda t: [1], 'dim'), - ('max', small_3d_unique, lambda t: [-1], 'neg_dim'), - ('max', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), - ('min', small_3d_unique, lambda t: []), - ('min', small_3d_unique, lambda t: [1], 'dim'), - ('min', small_3d_unique, lambda t: [-1], 'neg_dim'), - ('min', medium_2d, lambda t: [medium_2d(t)], 'elementwise'), - ('mean', small_3d, lambda t: []), - ('mean', small_3d, lambda t: [-1], 'neg_dim'), - ('mean', small_3d, lambda t: [1], 'dim'), - ('mean', giant_1d_ones, lambda t: [], '64bit_indexing', - # Double here because otherwise the CPU result will be - # wrong. - [torch.DoubleTensor]), - ('mode', small_3d, lambda t: []), - ('mode', small_3d, lambda t: [1], 'dim'), - ('mode', small_3d, lambda t: [-1], 'neg_dim'), - ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.1, 10), lambda t: [1], '2d_p=1', float_types_no_half), - ('mvlgamma', lambda t: tensor_clamp(small_2d(t), 0.6, 10), lambda t: [2], '2d_p=2', float_types_no_half), - ('remainder', small_3d, lambda t: [3], 'value',), - ('remainder', small_3d, lambda t: [-3], 'negative_value', signed_types), - ('remainder', small_3d, lambda t: [small_3d_positive(t)], 'tensor'), - ('remainder', small_3d, lambda t: [constant_tensor_sub(0, small_3d_positive(t))], 'negative_tensor', signed_types), - ('std', small_3d, lambda t: []), - ('std', small_3d, lambda t: [1], 'dim', types, False), - ('std', small_3d, lambda t: [-1], 'neg_dim', types, False), - ('var', small_3d, lambda t: []), - ('var', small_3d, lambda t: [1], 'dim'), - ('var', small_3d, lambda t: [-1], 'neg_dim'), - ('ndimension', small_3d, lambda t: [],), - ('nelement', small_3d, lambda t: [],), - ('numel', small_3d, lambda t: [],), - ('narrow', small_3d, lambda t: [1, 3, 2],), - ('narrow', small_3d, lambda t: [-1, 3, 2], 'neg_dim'), - ('nonzero', small_3d, lambda t: [], '', types, False), - ('norm', small_3d, lambda t: []), - ('norm', small_3d, lambda t: [3], '3_norm'), - ('norm', small_3d, lambda t: [3, 0], '3_norm_dim'), - ('norm', small_3d, lambda t: [3, -2], '3_norm_neg_dim'), - ('ones', small_3d, lambda t: [1, 2, 3, 4, 5],), - ('permute', new_t(1, 2, 3, 4), lambda t: [2, 1, 3, 0],), - ('put_', new_t(2, 5, 3), lambda t: [long_type(t)([[0], [-2]]), t([[3], [4]])], '', types, False), - ('put_', new_t(2, 3), lambda t: [long_type(t)([]), t([])], 'empty'), - ('put_', new_t(2, 2), lambda t: [long_type(t)([[1], [-3]]), t([[1], [2]]), True], 'accumulate'), - ('prod', small_2d_oneish, lambda t: []), - ('prod', small_3d, lambda t: [1], 'dim'), - ('prod', small_3d, lambda t: [-1], 'neg_dim'), - ('sum', small_2d, lambda t: []), - ('sum', small_3d, lambda t: [1], 
'dim'), - ('sum', small_3d, lambda t: [-1], 'neg_dim'), - ('renorm', small_3d, lambda t: [2, 1, 1], '2_norm'), - ('renorm', small_3d, lambda t: [2, -1, 1], '2_norm_neg_dim'), - ('renorm', small_3d, lambda t: [1.5, 1, 1], '1_5_norm'), - ('repeat', small_2d, lambda t: [2, 2, 2],), - ('size', new_t(1, 2, 3, 4), lambda t: [],), - ('size', new_t(1, 2, 3, 4), lambda t: [1], 'dim'), - ('size', new_t(1, 2, 3, 4), lambda t: [-2], 'neg_dim'), - ('sort', small_3d_unique, lambda t: [], ''), - ('sort', small_3d_unique, lambda t: [1], 'dim'), - ('sort', small_3d_unique, lambda t: [-1], 'neg_dim'), - ('sort', small_3d_unique, lambda t: [1, True], 'dim_descending'), - ('sort', small_3d_unique, lambda t: [-1, True], 'neg_dim_descending'), - ('split', small_3d, lambda t: [2],), - ('split', small_3d, lambda t: [2, 1], 'dim'), - ('split', small_3d, lambda t: [2, -3], 'neg_dim'), - ('squeeze', new_t(1, 2, 1, 4), lambda t: [],), - ('squeeze', new_t(1, 2, 1, 4), lambda t: [2], 'dim'), - ('squeeze', new_t(1, 2, 1, 4), lambda t: [-2], 'neg_dim'), - ('t', new_t(1, 2), lambda t: [],), - ('take', new_t(3, 4), lambda t: [long_type(t)([[0], [-2]])], '', types, False), - ('transpose', new_t(1, 2, 3, 4), lambda t: [1, 2],), - ('transpose', new_t(1, 2, 3, 4), lambda t: [-1, -2], 'neg_dim'), - ('to_list', small_3d, lambda t: [],), - ('topk', small_3d_unique, lambda t: [2, 1, False, True], 'dim_sort',), - ('topk', small_3d_unique, lambda t: [2, -1, False, True], 'neg_dim_sort',), - ('topk', small_3d_unique, lambda t: [2, 1, True, True], 'dim_desc_sort',), - ('trace', medium_2d, lambda t: []), - ('tril', medium_2d, lambda t: [],), - ('tril', medium_2d_expanded, lambda t: [], 'zero_stride', types, True), - ('tril', medium_2d, lambda t: [2], 'positive'), - ('tril', medium_2d, lambda t: [-2], 'negative'), - ('triu', medium_2d, lambda t: [],), - ('triu', medium_2d_expanded, lambda t: [], 'zero_stride', types, True), - ('triu', medium_2d, lambda t: [2], 'positive'), - ('triu', medium_2d, lambda t: [-2], 'negative'), - ('unsqueeze', new_t(2, 3, 4), lambda t: [2],), - ('unsqueeze', new_t(2, 3, 4), lambda t: [-2], 'neg_dim'), - ('view', small_3d, lambda t: [100, 10], 'contiguous'), - ('view_as', small_3d, lambda t: [make_tensor(t, 100, 10)],), - ('zero', small_3d, lambda t: [],), - ('zeros', small_3d, lambda t: [1, 2, 3, 4],), - ('eye', small_2d, lambda t: [3, 4],), - ('flip', small_3d, lambda t: [0], 'd0', types, True), - ('flip', small_3d, lambda t: [0, 1, 2], 'd012', types, True), - ('flip', small_3d, lambda t: [0, 2], 'd02', types, True), - ('flip', small_3d, lambda t: [2, 0], 'd20', types, True), - ('flip', small_3d, lambda t: [-1], 'neg_d', types, True), - ('rot90', small_2d, lambda t: [1, [0, 1]], 'k1_d01', types, True), - ('rot90', small_3d, lambda t: [1, [1, 2]], 'k1_d12', types, True), - ('rot90', small_3d, lambda t: [1, [1, -1]], 'k1_neg_d', types, True), - ('rot90', small_3d, lambda t: [], 'default', types, True), - ('rsqrt', lambda t: constant_tensor_add(1, small_3d(t)), lambda t: [], None, float_types), - ('sinh', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), - ('tan', lambda t: tensor_clamp(small_3d(t), -1, 1), lambda t: [], None, float_types), - ('__lshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(1, 5), t)), - lambda t: [2], None, signed_types), - ('__rshift__', lambda t: torch.pow(2, cast_tensor(torch.arange(3, 7), t)), - lambda t: [2], None, signed_types), - # lapack tests - ('qr', small_2d_lapack, lambda t: [], 'square', float_types, False, - unittest.skipIf(not 
TEST_MAGMA, "no MAGMA library detected")), - ('qr', small_2d_lapack_skinny, lambda t: [], 'skinny', float_types, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('qr', small_2d_lapack_fat, lambda t: [], 'fat', float_types, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('qr', large_2d_lapack, lambda t: [], 'big', float_types, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('geqrf', new_t(20, 20), lambda t: [], None, float_types, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('svd', new_t(10, 10), lambda t: [], 'square', float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('svd', lambda t: new_t(10, 10)(t).t(), lambda t: [True], 'square_col_maj', - float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('svd', new_t(20, 5), lambda t: [True], 'tall_some', float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('svd', new_t(20, 5), lambda t: [False], 'tall_all', float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('svd', lambda t: new_t(5, 20)(t).t(), lambda t: [True], - 'tall_some_col_maj', float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('svd', lambda t: new_t(5, 20)(t).t(), lambda t: [False], - 'tall_all_col_maj', float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), - ('eig', new_t(10, 10), lambda t: [True], 'with_eigvec', float_types_no_half, False, - unittest.skipIf(not TEST_MAGMA, "no MAGMA library detected")), -] - -# TODO: random functions, cat, gather, scatter, index*, masked*, -# resize, resizeAs, storage_offset, storage, stride, unfold - -custom_precision = { - 'addbmm': 1e-4, - 'addmm': 1e-4, - 'addmv': 1e-4, - 'addr': 1e-4, - 'baddbmm': 1e-4, - 'rsqrt': 1e-4, - 'cumprod': 1e-4, - 'qr': 3e-4, - 'digamma': 1e0, # large values lead to large absolute error but small relative error -} - -custom_half_precision = { - 'add': 1e-2, - 'acos': 1e-3, - 'addbmm': 1e-1, - 'addcdiv': 1e-2, - 'addcmul': 1e-2, - 'addmm': 1e-1, - 'addmv': 1e-2, - 'addr': 1e-2, - 'asin': 1e-3, - 'atan2': 1e-3, - 'atan': 1e-3, - 'baddbmm': 1e-2, - 'cos': 1e-3, - 'cosh': 1e-2, - 'cross': 1e-2, - 'cumprod': 1e-2, - 'cumsum': 1e-2, - 'dist': 1e-2, - 'div': 1e-3, - 'dot': 1e-2, - 'erf': 1e-3, - 'erfc': 1e-3, - 'exp': 1e-2, - 'expm1': 1e-2, - 'fill': 1e-3, - 'lerp': 1e-2, - 'lgamma': 1e-2, - 'log': 1e-2, - 'log10': 1e-2, - 'log1p': 1e-3, - 'log2': 1e-2, - 'mean': 1e-3, - 'mul': 1e-2, - 'norm': 1e-1, - 'pow': 1e-1, - 'prod': 1e-3, - 'reciprocal': 1e-1, - 'remainder': 1e-3, - 'renorm': 1e-3, - 'rsqrt': 1e-2, - 'sigmoid': 1e-3, - 'sin': 1e-3, - 'sinh': 1e-3, - 'sqrt': 1e-3, - 'std': 1e-3, - 'sub': 1e-2, - 'sum': 1e-2, - 'tan': 1e-3, - 'tanh': 1e-3, - 'trace': 1e-3, - 'var': 1e-3, - '__lshift__': 1e-3, - '__rshift__': 1e-3, -} - -simple_pointwise = [ - 'abs', - 'sign', -] -for fn in simple_pointwise: - tests.append((fn, small_3d, lambda t: [])) - -simple_pointwise_float = [ - 'log', - 'log10', - 'log1p', - 'log2', - 'sigmoid', - 'sin', - 'sqrt', - 'tanh', - 'acos', - 'asin', - 'atan', - 'cos', - 'cosh', - 'erf', - 'erfc', - 'exp', - 'expm1', - 'reciprocal', - 'floor', - 'frac', - 'neg', - 'round', - 'trunc', - 'ceil', - 'lgamma', - 'digamma', - 'trigamma', -] - -for fn in simple_pointwise_float: - tests.append((fn, small_3d, lambda t: [], None, float_types)) - 
_cycles_per_ms = None - def get_cycles_per_ms(): """Approximate number of cycles per millisecond for torch.cuda._sleep""" global _cycles_per_ms @@ -623,43 +86,6 @@ def get_cycles_per_ms(): return _cycles_per_ms -def compare_cpu_gpu(tensor_constructor, arg_constructor, fn, t, precision=1e-5): - def tmp(self): - cpu_tensor = tensor_constructor(t) - gpu_tensor = to_gpu(cpu_tensor) - cpu_args = arg_constructor(t) - gpu_args = [to_gpu(arg) for arg in cpu_args] - if is_half(t): - cpu_tensor = cpu_tensor.float() - cpu_args = [arg.float() if isinstance(arg, torch.Tensor) and is_half(arg) else arg for arg in cpu_args] - cpu_result = getattr(cpu_tensor, fn)(*cpu_args) - try: - gpu_result = getattr(gpu_tensor, fn)(*gpu_args) - except RuntimeError as e: - reason = e.args[0] - data_type_reasons = {'only supports floating-point types', - 'unimplemented data type', - 'not implemented for'} - if any(data_type_reason in reason for data_type_reason in data_type_reasons): - raise unittest.SkipTest('unimplemented data type') - raise - except AttributeError as e: - reason = e.args[0] - if 'object has no attribute' in reason: - raise unittest.SkipTest('unimplemented data type') - raise - # If one changes, another should change as well - self.assertEqual(cpu_tensor, gpu_tensor, precision) - self.assertEqual(cpu_args, gpu_args, precision) - # Compare results - if fn == 'element_size' and t.__name__ == 'HalfTensor': - # Workaround since cpu_result is float - self.assertEqual(2, gpu_result) - else: - self.assertEqual(cpu_result, gpu_result, precision) - return tmp - - class TestCuda(TestCase): _do_cuda_memory_leak_check = True _do_cuda_non_default_stream = True @@ -2147,6 +1573,7 @@ def _test_multinomial_invalid_probs_cuda(probs): except RuntimeError as e: return e + @slowTest @unittest.skipIf(NO_MULTIPROCESSING_SPAWN, "Disabled for environments that \ don't support multiprocessing with spawn start method") @unittest.skipIf(IS_WINDOWS, 'FIXME: CUDA OOM error on Windows') @@ -2406,94 +1833,5 @@ def worker(rank): """]) -def load_ignore_file(): - from os.path import join, dirname - global ignores - path = join(dirname(__file__), 'data', 'test_cuda_ignores.txt') - with open(path, 'r') as f: - ignores = {l for l in f.read().splitlines() if not l.startswith('#')} - - -def generate_tests(): - for decl in tests: - for t in types: - tensor = t() - - # Default values - desc = '' - type_subset = types - no_inplace = False - decorator = None - if len(decl) == 3: - name, constr, arg_constr = decl - elif len(decl) == 4: - name, constr, arg_constr, desc = decl - elif len(decl) == 5: - name, constr, arg_constr, desc, type_subset = decl - elif len(decl) == 6: - name, constr, arg_constr, desc, type_subset, no_inplace = decl - elif len(decl) == 7: - name, constr, arg_constr, desc, type_subset, no_inplace, decorator = decl - - if t not in type_subset: - continue - if TEST_WITH_ROCM and decorator is not None: - if (isinstance(decorator, str)): - tensor_type_name = str(t.__name__) - decorator_list = decorator.split(":") - skip_type_list = decorator_list[1].split(",") - if (("ByteTensor" in skip_type_list) and tensor_type_name == "ByteTensor") \ - or (("CharTensor" in skip_type_list) and tensor_type_name == "CharTensor") \ - or (("DoubleTensor" in skip_type_list) and tensor_type_name == "DoubleTensor") \ - or (("FloatTensor" in skip_type_list) and tensor_type_name == "FloatTensor") \ - or (("HalfTensor" in skip_type_list) and tensor_type_name == "HalfTensor") \ - or (("IntTensor" in skip_type_list) and tensor_type_name == "IntTensor") \ 
- or (("LongTensor" in skip_type_list) and tensor_type_name == "LongTensor") \ - or (("ShortTensor" in skip_type_list) and tensor_type_name == "ShortTensor"): - decorator = skipIfRocm - else: - decorator = None - elif ((not TEST_WITH_ROCM) and (decorator is not None)): - if (isinstance(decorator, str)): - decorator = None - - precision = custom_precision.get(name, TestCuda.precision) - if is_half(t): - precision = custom_half_precision.get(name, precision) - - for inplace in (True, False): - if inplace and no_inplace: - continue - if inplace: - name_inner = name + '_' - else: - name_inner = name - - if t != torch.HalfTensor and not hasattr(tensor, name_inner): - # torch.HalfTensor doesn't support most operations, - # but we use torch.FloatTensor as cpu baseline - continue - full_name = '{}.{}'.format(tensor.type(), name_inner) - if full_name in ignores: - continue - - test_name = 'test_' + t.__name__ + '_' + name_inner - if desc: - test_name += '_' + desc - - assert not hasattr(TestCuda, test_name), "Duplicated test name: " + test_name - - test_fn = compare_cpu_gpu(constr, arg_constr, name_inner, t, precision) - - if decorator is not None: - test_fn = decorator(test_fn) - - setattr(TestCuda, test_name, test_fn) - - if __name__ == '__main__': - if TEST_CUDA: - load_ignore_file() - generate_tests() - run_tests() diff --git a/test/test_torch.py b/test/test_torch.py index 7f73a3efe2b4a..18be60b363324 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -13347,12 +13347,542 @@ def test_ones_like_multiple_device(self, devices): self.assertEqual(output, expected) -add_neg_dim_tests() -instantiate_device_type_tests(TestTorchDeviceType, globals()) -instantiate_device_type_tests(TestDevicePrecision, globals(), except_for='cpu') +# Below are fixtures and functions that generate tensor op comparison tests +# These tests run a single op on both a CPU and device tensor and compare the +# the results. In-place variants of the ops can also be run. + +# Lists of dtypes to instantiate tensor op test variants. +_types = [ + torch.half, torch.float, torch.double, + torch.int8, torch.short, torch.int, torch.long, + torch.uint8 +] + +_float_types = [torch.half, torch.float, torch.double] + +_float_types_no_half = [torch.float, torch.double] + +_signed_types = [ + torch.half, torch.float, torch.double, + torch.int8, torch.short, torch.int, torch.long +] + +_signed_types_no_half = [ + torch.float, torch.double, + torch.int8, torch.short, torch.int, torch.long +] + +_unsigned_types = [torch.uint8] + +# Helper values and functions for producing tensors and scalars to use in tensor op tests. +# Tensor dimension sizes (Small, Medium, Large, Giant) +_S = 5 +_M = 50 +_L = 1000 +_G = 275000000 + +# Value to clamp divisors to since dividing by small numbers can be unstable +# on devices. +_div_min = 2**-8 + +# Returns floating or integral scalar corresponding to dtype +def _number(floating, integer, dtype): + if dtype in [torch.half, torch.float, torch.double]: + return floating + return integer + +# Converts half dtype to float when device is cpu +def _convert_t(dtype, device): + if device == 'cpu' and dtype == torch.half: + return torch.float + return dtype + +# Returns a tensor of the requested shape, dtype, and device +# Requesting a half CPU tensor returns a float CPU tensor with +# values representable by a half. +# Initialization uses randint for non-float types and randn for float types. 
+
+# Returns a tensor of the requested shape, dtype, and device
+# Requesting a half CPU tensor returns a float CPU tensor with
+# values representable by a half.
+# Initialization uses randint for non-float types and randn for float types.
+def _make_tensor(shape, dtype, device, fill_ones=False):
+    # Returns a tensor filled with ones
+    if fill_ones:
+        return torch.ones(*shape, dtype=_convert_t(dtype, device), device=device)
+
+    # Returns a tensor with random integer values
+    if dtype not in _float_types:
+        t = torch.randint(0, 10, shape, device=device)
+        return t.to(_convert_t(dtype, device))
+
+    # Populates the CPU tensor with floats representable as halfs
+    if dtype == torch.half and device == 'cpu':
+        return torch.randn(*shape, dtype=torch.float, device=device).half().float()
+
+    # Default: returns a tensor with random float values
+    return torch.randn(shape, dtype=dtype, device=device).to(dtype=dtype)
+
+def _small_0d(dtype, device):
+    return _make_tensor((1,), dtype, device).squeeze()
+
+def _small_2d(dtype, device, has_zeros=True, fill_ones=False, oneish=False):
+    t = _make_tensor((_S, _S), dtype, device, fill_ones=fill_ones)
+    if oneish:
+        return t.clamp(min=_number(.99, 1, dtype), max=1.01)
+    if not has_zeros:
+        return t.clamp(min=(_number(_div_min, 1, dtype)))
+    return t
+
+def _small_3d(dtype, device, has_zeros=True, fill_ones=False, oneish=False):
+    t = _make_tensor((_S, _S, _S), dtype, device, fill_ones=fill_ones)
+    if oneish:
+        return t.clamp(min=_number(.99, 1, dtype), max=1.01)
+    if not has_zeros:
+        return t.clamp(min=(_number(_div_min, 1, dtype)))
+    return t
+
+def _small_3d_ones(dtype, device):
+    return _small_3d(dtype, device, fill_ones=True)
+
+def _small_3d_unique(dtype, device):
+    return (torch.randperm(_S * _S * _S,
+                           dtype=_convert_t(dtype, device), device=device) + 1).view(_S, _S, _S)
+
+def _medium_1d(dtype, device):
+    return _make_tensor((_M,), dtype, device)
+
+def _medium_2d(dtype, device):
+    return _make_tensor((_M, _M), dtype, device)
+
+def _large_2d(dtype, device):
+    t = _make_tensor((_L, _L), dtype, device)
+    return t.normal_()
+
+def _giant_1d(dtype, device):
+    return _make_tensor((_G), dtype, device)
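For orientation, the fixtures above are used like this (an illustrative sketch;
shapes follow the _S/_M constants defined earlier, and the CUDA device is assumed):

    t = _small_3d(torch.float, 'cpu')         # 5x5x5 float tensor, randn-initialized
    u = _small_3d_unique(torch.half, 'cpu')   # 5x5x5 float tensor (CPU half fallback), values 1..125
    v = _medium_2d(torch.long, 'cuda')        # 50x50 long tensor, randint-initialized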
+
+# Helper method that returns a function which takes dtype and device and
+# instantiates tensors of the given shape.
+# Useful for tensor op tests with custom shapes.
+def _new_t(shape):
+    def tmp(dtype, device):
+        return _make_tensor(shape, dtype, device)
+    return tmp
+
+# TODO: random functions, cat, gather, scatter, index*, masked*,
+#       resize, resizeAs, storage_offset, storage, stride, unfold
+# Each test is defined in tensor_op_tests as a tuple of:
+# - op name (string)
+# - (sub)test name (string)
+# - tensor constructor, takes dtype and device and constructs the tensor to run the op on
+# - arg constructor, takes dtype and device and constructs op arguments
+# - torch.half precision (=1e-5)
+# - precision (=1e-5), precision to use for all other dtypes
+# - make_inplace_variant (=True), if true the inplace version of the op (op_) is also tested
+# - dtype_list (=_types), a list of torch dtypes to test the op(s) with
+# - decorators (=[]), a list of decorators to apply to the test
+tensor_op_tests = [
+    ('add', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-2),
+    ('add', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-2),
+    ('sub', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-2),
+    ('sub', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-2),
+    ('mul', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-2),
+    ('mul', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d)], 1e-2),
+    ('mul', 'scalar', _small_0d, lambda t, d: [_small_0d(torch.int32, d)], 1e-2),
+    ('div', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-1),
+    ('div', 'tensor', _small_3d,
+        lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-1),
+    ('pow', '', _small_3d, lambda t, d: [_number(3.14, 3, t)], 1e-1, 1e-5, _float_types),
+    ('pow', '1', _small_3d, lambda t, d: [_number(1., 1, t)], 1e-1),
+    ('pow', '2', _small_3d, lambda t, d: [_number(2., 2, t)], 1e-1),
+    ('pow', '3', _small_3d, lambda t, d: [_number(3., 3, t)], 1e-1),
+    ('pow', '-1', _small_3d, lambda t, d: [_number(-1., -1, t)], 1e-1, 1e-5, _float_types),
+    ('pow', '-2', _small_3d, lambda t, d: [_number(-2., -2, t)],
+        1e-1, 1e-5, _float_types_no_half, False, [skipCUDAIfRocm]),
+    ('pow', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d).abs()],
+        1e-1, 1e-5, _float_types),
+    ('addbmm', '', _small_2d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)],
+        1e-1, 1e-4, _float_types),
+    ('addbmm', 'scalar', _small_2d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)],
+        1e-1, 1e-4, _float_types),
+    ('addbmm', 'two_scalars', _small_2d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)],
+        1e-1, 1e-4, _float_types),
+    ('baddbmm', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)],
+        1e-2, 1e-4, _float_types),
+    ('baddbmm', 'scalar', _small_3d, lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)],
+        1e-2, 1e-4, _float_types),
+    ('baddbmm', 'two_scalars', _small_3d, lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)],
+        1e-2, 1e-4, _float_types),
+    ('bmm', '', _small_3d, lambda t, d: [_small_3d(t, d)],
+        1e-5, 1e-5, _float_types_no_half, False),
+    ('addcdiv', '', _small_2d,
+        lambda t, d: [_small_2d(t, d),
+                      _small_2d(t, d, has_zeros=False)], 1, 1e-3),
+    ('addcdiv', 'scalar', _small_2d,
+        lambda t, d: [_number(2.8, 1, t), _small_2d(t, d),
+                      _small_2d(t, d, has_zeros=False)], 1, 1e-3),
+    ('addcmul', '', _small_3d, lambda t, d: [_small_3d(t, d), _small_3d(t, d)], 1e-2, 1e-3),
+    ('addcmul', 'scalar', _small_3d,
+        lambda t, d: [_number(0.4, 2, t), _small_3d(t, d), _small_3d(t, d)], 1e-2),
+    ('addmm', '', _medium_2d, lambda t, d: [_medium_2d(t, d), _medium_2d(t, d)],
+        1e-1, 1e-4,
_float_types), + ('addmm', 'scalar', _medium_2d, + lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], + 1e-1, 1e-4, _float_types), + ('addmm', 'two_scalars', _medium_2d, + lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_2d(t, d)], + 1e-1, 1e-4, _float_types), + ('addmv', '', _medium_1d, lambda t, d: [_medium_2d(t, d), _medium_1d(t, d)], + 1e-2, 1e-4, _float_types), + ('addmv', 'scalar', _medium_1d, + lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], + 1e-2, 1e-4, _float_types), + ('addmv', 'two_scalars', _medium_1d, + lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], + 1e-2, 1e-4, _float_types), + ('addr', '', _medium_2d, lambda t, d: [_medium_1d(t, d), _medium_1d(t, d)], + 1e-2, 1e-4, _float_types), + ('addr', 'scalar', _medium_2d, + lambda t, d: [_number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], + 1e-2, 1e-4, _float_types), + ('addr', 'two_scalars', _medium_2d, + lambda t, d: [_number(0.5, 3, t), _number(0.4, 2, t), _medium_1d(t, d), _medium_1d(t, d)], + 1e-2, 1e-4, _float_types), + ('atan2', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-2, 1e-5, _float_types), + ('fmod', 'value', _small_3d, lambda t, d: [3], 1e-3), + ('fmod', 'tensor', _small_3d, lambda t, d: [_small_3d(t, d, has_zeros=False)], 1e-3), + ('chunk', '', _medium_2d, lambda t, d: [4], 1e-5, 1e-5, _types, False), + ('chunk', 'dim', _medium_2d, lambda t, d: [4, 1], 1e-5, 1e-5, _types, False), + ('chunk', 'neg_dim', _medium_2d, lambda t, d: [4, -2], 1e-5, 1e-5, _types, False), + ('clamp', 'neg', _medium_2d, lambda t, d: [-1, 5], 1e-5, 1e-5, _signed_types), + ('clamp', 'pos', _medium_2d, lambda t, d: [1, 5], 1e-5, 1e-5, _unsigned_types), + ('clone', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('cross', '', _new_t((_M, 3, _M)), lambda t, d: [_new_t((_M, 3, _M))(t, d)], + 1e-2, 1e-5, _types, False), + ('cumprod', '', _small_3d, lambda t, d: [1], 1e-2, 1e-4, _types, False), + ('cumprod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-4, _types, False), + ('cumsum', '', _small_3d, lambda t, d: [1], 1e-2, 1e-5, _types, False), + ('cumsum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, _types, False), + ('dim', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('dist', '', _small_2d, lambda t, d: [_small_2d(t, d)], 1e-2, 1e-5, _float_types, False), + ('dist', '3_norm', _small_2d, lambda t, d: [_small_2d(t, d), 3], 1e-2, 1e-5, _float_types, False), + ('dist', '2_5_norm', _small_2d, lambda t, d: [_small_2d(t, d), 2.5], + 1e-2, 1e-5, _float_types, False), + ('dot', '', _medium_1d, lambda t, d: [_medium_1d(t, d)], + 1e-2, 1e-5, _float_types, False, [skipCUDAIfRocm]), + ('element_size', '', _medium_1d, lambda t, d: [], 1e-5, 1e-5, _float_types_no_half, False), + ('eq', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)],), + ('eq', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)]), + ('ne', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)],), + ('ne', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)]), + ('equal', 'equal', _small_3d_ones, lambda t, d: [_small_3d_ones(t, d)], + 1e-5, 1e-5, _types, False), + ('equal', '', _small_3d_ones, lambda t, d: [_small_3d(t, d)], 1e-5, 1e-5, _types, False), + ('expand', '', _new_t((_M, 1, _M)), lambda t, d: [_M, 4, _M], 1e-5, 1e-5, _types, False), + ('expand_as', '', _new_t((_M, 1, _M)), lambda t, d: [_new_t((_M, 4, _M))(t, d)], + 1e-5, 
1e-5, _types, False), + ('fill_', '', _medium_2d, lambda t, d: [_number(3.14, 3, t)], 1e-3, 1e-5, _types, False), + ('ge', '', _medium_2d, lambda t, d: [_medium_2d(t, d)],), + ('le', '', _medium_2d, lambda t, d: [_medium_2d(t, d)],), + ('gt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)],), + ('lt', '', _medium_2d, lambda t, d: [_medium_2d(t, d)],), + ('is_contiguous', '', _medium_2d, lambda t, d: [], 1e-5, 1e-5, _types, False), + # TODO: can't check negative case - cross-device copy is contiguous + ('is_same_size', 'negative', _medium_2d, lambda t, d: [_small_3d(t, d)], + 1e-5, 1e-5, _types, False), + ('is_same_size', 'positive', _medium_2d, lambda t, d: [_medium_2d(t, d)], + 1e-5, 1e-5, _types, False), + ('is_set_to', '', _medium_2d, lambda t, d: [_medium_2d(t, d)], 1e-5, 1e-5, _types, False), + # TODO: positive case + ('kthvalue', '', _small_3d_unique, lambda t, d: [3], 1e-5, 1e-5, _types, False), + ('kthvalue', 'dim', _small_3d_unique, lambda t, d: [3, 1], 1e-5, 1e-5, _types, False), + ('kthvalue', 'neg_dim', _small_3d_unique, lambda t, d: [3, -1], 1e-5, 1e-5, _types, False), + ('lerp', '', _small_3d, lambda t, d: [_small_3d(t, d), 0.3], + 1e-2, 1e-5, _float_types_no_half), + ('max', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('max', 'dim', _small_3d_unique, lambda t, d: [1], 1e-5, 1e-5, _types, False), + ('max', 'neg_dim', _small_3d_unique, lambda t, d: [-1], 1e-5, 1e-5, _types, False), + ('max', 'elementwise', _medium_2d, lambda t, d: [_medium_2d(t, d)], + 1e-5, 1e-5, _types, False), + ('min', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('min', 'dim', _small_3d_unique, lambda t, d: [1], 1e-5, 1e-5, _types, False), + ('min', 'neg_dim', _small_3d_unique, lambda t, d: [-1], 1e-5, 1e-5, _types, False), + ('min', 'elementwise', _medium_2d, lambda t, d: [_medium_2d(t, d)], + 1e-5, 1e-5, _types, False), + ('mean', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types, False), + ('mean', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, _float_types, False), + ('mean', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, _float_types, False), + # Double here because the CPU result will be wrong otherwise + ('mean', '64bit_indexing', _giant_1d, lambda t, d: [], + 1e-3, 1e-5, [torch.double], False, [slowTest]), + ('mode', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('mode', 'dim', _small_3d, lambda t, d: [1], 1e-5, 1e-5, _types, False), + ('mode', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-5, 1e-5, _types, False), + ('mvlgamma', '2d_p=1', lambda t, d: _small_2d(t, d).clamp(0.1, 10), lambda t, d: [1], + 1e-5, 1e-5, _float_types_no_half), + ('mvlgamma', '2d_p=2', lambda t, d: _small_2d(t, d).clamp(0.6, 10), lambda t, d: [2], + 1e-5, 1e-5, _float_types_no_half), + ('remainder', 'value', _small_3d, lambda t, d: [3], 1e-1, 1e-5, _signed_types), + ('remainder', 'negative_value', _small_3d, lambda t, d: [-3], 1e-1, 1e-5, _signed_types), + ('remainder', 'tensor', _small_3d, + lambda t, d: [_small_3d(t, d, has_zeros=False)], + 1e-1, 1e-5, _signed_types), + ('remainder', 'negative_tensor', _small_3d, + lambda t, d: [0 - _small_3d(t, d, has_zeros=False)], + 1e-1, 1e-5, _signed_types), + ('std', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types, False), + ('std', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, _float_types, False), + ('std', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, _float_types, False), + ('var', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types, False), + ('var', 'dim', _small_3d, lambda t, d: [1], 1e-3, 
1e-5, _float_types, False), + ('var', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, _float_types, False), + ('ndimension', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('nelement', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('numel', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('narrow', '', _small_3d, lambda t, d: [1, 3, 2], 1e-5, 1e-5, _types, False), + ('narrow', 'neg_dim', _small_3d, lambda t, d: [-1, 3, 2], 1e-5, 1e-5, _types, False), + ('nonzero', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('norm', '', _small_3d, lambda t, d: [], 1e-1, 1e-5, _float_types, False), + ('norm', '3_norm', _small_3d, lambda t, d: [3], 1e-1, 1e-5, _float_types, False), + ('norm', '3_norm_dim', _small_3d, lambda t, d: [3, 0], 1e-1, 1e-5, _float_types, False), + ('norm', '3_norm_neg_dim', _small_3d, lambda t, d: [3, -2], 1e-1, 1e-5, _float_types, False), + ('new_ones', '', _small_3d, lambda t, d: [1, 2, 3, 4, 5], 1e-5, 1e-5, _types, False), + ('permute', '', _new_t((1, 2, 3, 4)), lambda t, d: [2, 1, 3, 0], 1e-5, 1e-5, _types, False), + ('put_', '', _new_t((2, 5, 3)), + lambda t, d: [torch.LongTensor([[0], [-2]]).to(device=d), + torch.LongTensor([[3], [4]]).to(dtype=_convert_t(t, d), device=d)], + 1e-5, 1e-5, _types, False), + ('put_', 'empty', _new_t((2, 3)), + lambda t, d: [torch.LongTensor([]).to(device=d), torch.LongTensor([]).to(dtype=_convert_t(t, d), device=d)], + 1e-5, 1e-5, _types, False), + ('put_', 'accumulate', _new_t((2, 2)), + lambda t, d: [torch.LongTensor([[1], [-3]]).to(device=d), + torch.LongTensor([[1], [2]]).to(dtype=_convert_t(t, d), device=d), + True], + 1e-5, 1e-5, _types, False), + ('prod', '', lambda t, d: _small_2d(t, d, oneish=True), + lambda t, d: [], 1e-2, 1e-5, _types, False), + ('prod', 'dim', _small_3d, lambda t, d: [1], 1e-3, 1e-5, _types, False), + ('prod', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-3, 1e-5, _types, False), + ('sum', '', _small_2d, lambda t, d: [], 1e-2, 1e-5, _types, False), + ('sum', 'dim', _small_3d, lambda t, d: [1], 1e-2, 1e-5, _types, False), + ('sum', 'neg_dim', _small_3d, lambda t, d: [-1], 1e-2, 1e-5, _types, False), + ('renorm', '2_norm', _small_3d, lambda t, d: [2, 1, 1], 1e-3, 1e-5, _float_types), + ('renorm', '2_norm_neg_dim', _small_3d, lambda t, d: [2, -1, 1], 1e-3, 1e-5, _float_types), + ('renorm', '1_5_norm', _small_3d, lambda t, d: [1.5, 1, 1], 1e-3, 1e-5, _float_types), + ('repeat', '', _small_2d, lambda t, d: [2, 2, 2], 1e-5, 1e-5, _types, False), + ('size', '', _new_t((1, 2, 3, 4)), lambda t, d: [], 1e-5, 1e-5, _types, False), + ('size', 'dim', _new_t((1, 2, 3, 4)), lambda t, d: [1], 1e-5, 1e-5, _types, False), + ('size', 'neg_dim', _new_t((1, 2, 3, 4)), lambda t, d: [-2], 1e-5, 1e-5, _types, False), + ('sort', '', _small_3d_unique, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('sort', 'dim', _small_3d_unique, lambda t, d: [1], 1e-5, 1e-5, _types, False), + ('sort', 'neg_dim', _small_3d_unique, lambda t, d: [-1], 1e-5, 1e-5, _types, False), + ('sort', 'dim_descending', _small_3d_unique, lambda t, d: [1, True], 1e-5, 1e-5, _types, False), + ('sort', 'neg_dim_descending', _small_3d_unique, lambda t, d: [-1, True], 1e-5, 1e-5, _types, False), + ('split', '', _small_3d, lambda t, d: [2], 1e-5, 1e-5, _types, False), + ('split', 'dim', _small_3d, lambda t, d: [2, 1], 1e-5, 1e-5, _types, False), + ('split', 'neg_dim', _small_3d, lambda t, d: [2, -3], 1e-5, 1e-5, _types, False), + ('squeeze', '', _new_t((1, 2, 1, 4)), lambda t, d: [],), + ('squeeze', 'dim', 
_new_t((1, 2, 1, 4)), lambda t, d: [2], ), + ('squeeze', 'neg_dim', _new_t((1, 2, 1, 4)), lambda t, d: [-2], ), + ('t', '', _new_t((1, 2)), lambda t, d: [],), + ('take', '', _new_t((3, 4)), + lambda t, d: [torch.LongTensor([[0], [-2]]).to(device=d)], + 1e-5, 1e-5, _types, False), + ('transpose', '', _new_t((1, 2, 3, 4)), lambda t, d: [1, 2],), + ('transpose', 'neg_dim', _new_t((1, 2, 3, 4)), lambda t, d: [-1, -2], ), + ('tolist', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('topk', 'dim_sort', _small_3d_unique, lambda t, d: [2, 1, False, True], + 1e-5, 1e-5, _types, False), + ('topk', 'neg_dim_sort', _small_3d_unique, lambda t, d: [2, -1, False, True], + 1e-5, 1e-5, _types, False), + ('topk', 'dim_desc_sort', _small_3d_unique, lambda t, d: [2, 1, True, True], + 1e-5, 1e-5, _types, False), + ('trace', '', _medium_2d, lambda t, d: [], 1e-3, 1e-5, _types, False), + ('tril', '', _medium_2d, lambda t, d: [],), + ('tril', 'zero_stride', _medium_2d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('tril', 'positive', _medium_2d, lambda t, d: [2], ), + ('tril', 'negative', _medium_2d, lambda t, d: [-2], ), + ('triu', '', _medium_2d, lambda t, d: [],), + ('triu', 'zero_stride', _medium_2d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('triu', 'positive', _medium_2d, lambda t, d: [2], ), + ('triu', 'negative', _medium_2d, lambda t, d: [-2], ), + ('unsqueeze', '', _new_t((2, 3, 4)), lambda t, d: [2],), + ('unsqueeze', 'neg_dim', _new_t((2, 3, 4)), lambda t, d: [-2], ), + ('view', 'contiguous', _small_3d, lambda t, d: [25, 5], 1e-5, 1e-5, _types, False), + ('view_as', '', _small_3d, lambda t, d: [_make_tensor((25, 5), t, d)], + 1e-5, 1e-5, _types, False), + ('zero_', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('new_zeros', '', _small_3d, lambda t, d: [1, 2, 3, 4], 1e-5, 1e-5, _types, False), + ('flip', 'd0', _small_3d, lambda t, d: [0], 1e-5, 1e-5, _types, False), + ('flip', 'd012', _small_3d, lambda t, d: [0, 1, 2], 1e-5, 1e-5, _types, False), + ('flip', 'd02', _small_3d, lambda t, d: [0, 2], 1e-5, 1e-5, _types, False), + ('flip', 'd20', _small_3d, lambda t, d: [2, 0], 1e-5, 1e-5, _types, False), + ('flip', 'neg_d', _small_3d, lambda t, d: [-1], 1e-5, 1e-5, _types, False), + ('rot90', 'k1_d01', _small_2d, lambda t, d: [1, [0, 1]], 1e-5, 1e-5, _types, False), + ('rot90', 'k1_d12', _small_3d, lambda t, d: [1, [1, 2]], 1e-5, 1e-5, _types, False), + ('rot90', 'k1_neg_d', _small_3d, lambda t, d: [1, [1, -1]], 1e-5, 1e-5, _types, False), + ('rot90', 'default', _small_3d, lambda t, d: [], 1e-5, 1e-5, _types, False), + ('rsqrt', '', lambda t, d: _small_3d(t, d) + 1, lambda t, d: [], 1e-2, 1e-4, _float_types_no_half), + ('sinh', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, _float_types), + ('tan', '', lambda t, d: _small_3d(t, d).clamp(-1, 1), lambda t, d: [], 1e-3, 1e-5, _float_types), + ('__lshift__', '', + lambda t, d: torch.pow(2, torch.arange(1, 5).to(dtype=_convert_t(t, d), device=d)), + lambda t, d: [2], + 1e-3, 1e-3, _signed_types_no_half, False), + ('__rshift__', '', + lambda t, d: torch.pow(2, torch.arange(3, 7).to(dtype=_convert_t(t, d), device=d)), + lambda t, d: [2], + 1e-3, 1e-3, _signed_types_no_half, False), + # lapack tests + ('qr', 'square', _small_2d, lambda t, d: [], + 1e-5, 3e-4, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('qr', 'skinny', _new_t((3, 4)), lambda t, d: [], + 1e-5, 3e-4, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('qr', 'fat', _new_t((4, 3)), lambda t, d: [], + 1e-5, 3e-4, 
_float_types_no_half, False, [skipCUDAIfNoMagma]), + ('qr', 'big', _large_2d, lambda t, d: [], + 1e-5, 3e-4, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('geqrf', '', _new_t((20, 20)), lambda t, d: [], + 1e-5, 3e-4, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('svd', 'square', _new_t((10, 10)), lambda t, d: [], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('svd', 'square_col_maj', lambda t, d: _new_t((10, 10))(t, d).t(), lambda t, d: [True], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('svd', 'tall_some', _new_t((20, 5)), lambda t, d: [True], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('svd', 'tall_all', _new_t((20, 5)), lambda t, d: [False], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('svd', 'tall_some_col_maj', lambda t, d: _new_t((5, 20))(t, d).t(), lambda t, d: [True], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('svd', 'tall_all_col_maj', lambda t, d: _new_t((5, 20))(t, d).t(), lambda t, d: [False], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('eig', 'with_eigvec', _new_t((10, 10)), lambda t, d: [True], + 1e-5, 1e-5, _float_types_no_half, False, [skipCUDAIfNoMagma]), + ('abs', '', _small_3d, lambda t, d: []), + ('sign', '', _small_3d, lambda t, d: []), + ('log', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types), + ('log10', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types), + ('log1p', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types_no_half), + ('log2', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types), + ('sigmoid', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('sin', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('sqrt', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('acos', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('asin', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('atan', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('cos', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('cosh', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types), + ('erf', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('erfc', '', _small_3d, lambda t, d: [], 1e-3, 1e-5, _float_types), + ('exp', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types), + ('expm1', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types), + ('reciprocal', '', _small_3d, lambda t, d: [], 1e-1, 1e-5, _float_types), + ('floor', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _float_types), + ('frac', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _float_types), + ('neg', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _float_types), + ('round', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _float_types), + ('trunc', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _float_types), + ('ceil', '', _small_3d, lambda t, d: [], 1e-5, 1e-5, _float_types), + ('lgamma', '', _small_3d, lambda t, d: [], 1e-2, 1e-5, _float_types_no_half), + ('digamma', 'op', _small_3d, lambda t, d: [], 1e-5, 1e0, _float_types_no_half), +] + +# Creates and decorates a generic test and adds it to the class. 
+def generate_test_function(cls,
+                           op_str,
+                           subtest_str,
+                           tensor_ctor,
+                           arg_ctor,
+                           half_precision,
+                           float_precision,
+                           dtype_list,
+                           decorators):
+    def fn(self, device, dtype):
+        # Generates the CPU inputs
+        # Note: CPU tensors are never torch.half
+        cpu_tensor = tensor_ctor(dtype, 'cpu')
+        cpu_args = arg_ctor(dtype, 'cpu')
+
+        # Converts CPU tensors to device tensors
+        device_tensor = cpu_tensor.to(dtype=dtype, device=device)
+        device_args = [arg.to(device=device) if torch.is_tensor(arg) else arg for arg in cpu_args]
+
+        # Converts float device tensors to half when the dtype is half
+        # Note: CPU half tensors don't support many operations.
+        if dtype == torch.half:
+            device_args = [arg.to(dtype=dtype) if
+                           (torch.is_tensor(arg) and arg.dtype == torch.float) else arg
+                           for arg in device_args]
+
+        # Runs the tensor op on CPU and device
+        cpu_result = getattr(cpu_tensor, op_str)(*cpu_args)
+        device_result = getattr(device_tensor, op_str)(*device_args)
+
+        # Compares CPU and device inputs and outputs
+        precision = half_precision if dtype == torch.half else float_precision
+
+        self.assertEqual(cpu_tensor, device_tensor, prec=precision)
+        self.assertEqual(cpu_args, device_args, prec=precision)
+        self.assertEqual(cpu_result, device_result, prec=precision)
+
+    test_name = "test_" + op_str + subtest_str
+    assert not hasattr(cls, test_name), "{0} already in TestDevicePrecision".format(test_name)
+
+    # Constructs decorator list and applies decorators
+    if decorators is None:
+        decorators = [dtypes(*dtype_list)]
+    else:
+        decorators = decorators + [dtypes(*dtype_list)]
+
+    for dec in decorators:
+        fn = dec(fn)
+
+    setattr(cls, test_name, fn)
+
+# Instantiates variants of tensor_op_tests and adds them to the given class.
+def generate_tensor_op_tests(cls):
+
+    def caller(cls,
+               op_str,
+               subtest_str,
+               tensor_ctor,
+               arg_ctor,
+               half_precision=1e-5,
+               float_precision=1e-5,
+               dtype_list=_types,
+               make_inplace_variant=True,
+               decorators=None):
+        if subtest_str:
+            subtest_str = '_' + subtest_str
+
+        generate_test_function(cls, op_str, subtest_str, tensor_ctor, arg_ctor,
+                               half_precision, float_precision, dtype_list, decorators)
+
+        if make_inplace_variant:
+            op_str = op_str + '_'
+            subtest_str = 'inplace' + subtest_str
+            generate_test_function(cls, op_str, subtest_str, tensor_ctor, arg_ctor,
+                                   half_precision, float_precision, dtype_list, decorators)
+
+    for test in tensor_op_tests:
+        caller(cls, *test)
+
+
+class TestTensorDeviceOps(TestCase):
+    pass
+
 class TestTorch(TestCase, _TestTorchMixin):
     pass
+
+
+# Generates tests
+# Note: test generation must be done at file scope, not within main, or
+# pytest will fail.
+add_neg_dim_tests()
+generate_tensor_op_tests(TestTensorDeviceOps)
+instantiate_device_type_tests(TestTorchDeviceType, globals())
+instantiate_device_type_tests(TestDevicePrecision, globals(), except_for='cpu')
+instantiate_device_type_tests(TestTensorDeviceOps, globals(), except_for='cpu')
+
 if __name__ == '__main__':
     run_tests()
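Before moving on, a minimal sketch of what a device-generic test looks like
under this framework (an editorial illustration, not part of the patch; it
assumes the `dtypes` decorator and `instantiate_device_type_tests` from
test/common_device_type.py shown above, and the generated test names are
assumed to follow the `name + '_' + device_type` pattern from instantiate_test):

    import torch
    from common_device_type import dtypes, instantiate_device_type_tests
    from common_utils import TestCase, run_tests

    class TestExample(TestCase):
        @dtypes(torch.float, torch.double)
        def test_double_it(self, device, dtype):
            t = torch.ones(3, device=device, dtype=dtype)
            self.assertEqual((t * 2).sum().item(), 6)

    # Instantiation generates e.g. test_double_it_cpu_float32, test_double_it_cuda_float64, ...
    instantiate_device_type_tests(TestExample, globals())

    if __name__ == '__main__':
        run_tests()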
This PR uses `torch._six` instead of `six` as a replacement.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/27282

Reviewed By: lerks

Differential Revision: D17737561

Pulled By: fmassa

fbshipit-source-id: 7dcd0cc2c8bab27b8f4535f664f60388818d3497
---
 torch/distributed/internal_rpc_utils.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/torch/distributed/internal_rpc_utils.py b/torch/distributed/internal_rpc_utils.py
index 2734294b875f9..99834a929701e 100644
--- a/torch/distributed/internal_rpc_utils.py
+++ b/torch/distributed/internal_rpc_utils.py
@@ -4,7 +4,6 @@
 import copyreg
 import io
 import pickle
-import six
 import threading
 import traceback
@@ -26,10 +25,10 @@ class _InternalRPCPickler:
     e.g. attach tensor to distributed autograd graph in C++
     """
     def __init__(self):
-        # python2 does not have dispatch_table, add "if six.PY3" condition,
+        # python2 does not have dispatch_table, add "if torch._six.PY3" condition,
         # as _InternalRPCPickler still got build in python2 even
         # we skipped python 2 tests for rpc_test
-        if six.PY3:
+        if torch._six.PY3:
             self._dispatch_table = copyreg.dispatch_table.copy()
             self._dispatch_table[torch.Tensor] = self._tensor_reducer

From e29baaca3d9700909492f4b6ee8bb57737783d21 Mon Sep 17 00:00:00 2001
From: Richard Zou
Date: Fri, 4 Oct 2019 07:17:17 -0700
Subject: [PATCH 051/113] Make `align_to` method-only. (#27304)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27304

The ellipsis version of `align_to` only works if it is called as a method. To prevent any confusion, this PR disables `torch.align_to` (but keeps `Tensor.align_to`).

Test Plan: - [namedtensor ci]

Differential Revision: D17743809

Pulled By: zou3519

fbshipit-source-id: cf5c53dcf45ba244f61bb1e00e4853de5db6c241
---
 aten/src/ATen/native/native_functions.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index b4ce663e6cb58..910a6c867eb40 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -66,7 +66,7 @@
   supports_named_tensor: True

 - func: align_to(Tensor(a) self, DimnameList names) -> Tensor(a)
-  variants: function, method
+  variants: method
   supports_named_tensor: True

 - func: align_as(Tensor self, Tensor other) -> Tensor

From 081069e8ca87201ca80a5cf8bdd2a6fb204c22fb Mon Sep 17 00:00:00 2001
From: Hong Xu
Date: Fri, 4 Oct 2019 07:48:19 -0700
Subject: [PATCH 052/113] Remove CUDA_VERSION from Python script (which has already been detected in CMake) (#27316)

Summary: (Intentionally left blank)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/27316

Differential Revision: D17762715

Pulled By: ezyang

fbshipit-source-id: 044c0ea6e8c2d12912c946a9a50b934b5253d8c8
---
 setup.py                    | 23 ++++++++++++----------
 tools/setup_helpers/cuda.py | 38 -------------------------------------
 2 files changed, 13 insertions(+), 48 deletions(-)

diff --git a/setup.py b/setup.py
index ad94b94eb21e9..1850db6600905 100644
--- a/setup.py
+++ b/setup.py
@@ -184,7 +184,7 @@
 from tools.setup_helpers.env import (IS_WINDOWS, IS_DARWIN, check_env_flag,
                                      build_type)
 from tools.setup_helpers.cmake import CMake
-from tools.setup_helpers.cuda import CUDA_HOME, CUDA_VERSION
+from tools.setup_helpers.cuda import CUDA_HOME

 try:
     FileNotFoundError
@@ -280,15 +280,6 @@ def report(*args):

 # all the work we need to do _before_ setup runs
 def build_deps():
     report('-- Building version ' + version)
-    version_path =
os.path.join(cwd, 'torch', 'version.py') - with open(version_path, 'w') as f: - f.write("__version__ = '{}'\n".format(version)) - # NB: This is not 100% accurate, because you could have built the - # library code with DEBUG, but csrc without DEBUG (in which case - # this would claim to be a release build when it's not.) - f.write("debug = {}\n".format(repr(build_type.is_debug()))) - f.write("cuda = {}\n".format(repr(CUDA_VERSION))) - f.write("git_version = {}\n".format(repr(sha))) def check_file(f): if not os.path.exists(f): @@ -318,6 +309,18 @@ def check_file(f): rerun_cmake=RERUN_CMAKE, cmake_only=CMAKE_ONLY, cmake=cmake) + + version_path = os.path.join(cwd, 'torch', 'version.py') + with open(version_path, 'w') as f: + f.write("__version__ = '{}'\n".format(version)) + # NB: This is not 100% accurate, because you could have built the + # library code with DEBUG, but csrc without DEBUG (in which case + # this would claim to be a release build when it's not.) + f.write("debug = {}\n".format(repr(build_type.is_debug()))) + cmake_cache_vars = defaultdict(lambda: None, cmake.get_cmake_cache_variables()) + f.write("cuda = {}\n".format(repr(cmake_cache_vars['CUDA_VERSION']))) + f.write("git_version = {}\n".format(repr(sha))) + if CMAKE_ONLY: report('Finished running cmake. Run "ccmake build" or ' '"cmake-gui build" to adjust build options and ' diff --git a/tools/setup_helpers/cuda.py b/tools/setup_helpers/cuda.py index 3a1a27712e6e8..06f549d3d00b2 100644 --- a/tools/setup_helpers/cuda.py +++ b/tools/setup_helpers/cuda.py @@ -1,8 +1,6 @@ import os import glob -import re import ctypes.util -from subprocess import Popen, PIPE from . import which from .env import IS_WINDOWS, IS_LINUX, IS_DARWIN, check_negative_env_flag @@ -19,44 +17,9 @@ def find_nvcc(): return None -def find_cuda_version(cuda_home): - if cuda_home is None: - return None - if IS_WINDOWS: - candidate_names = [os.path.basename(cuda_home)] - else: - # get CUDA lib folder - cuda_lib_dirs = ['lib64', 'lib'] - for lib_dir in cuda_lib_dirs: - cuda_lib_path = os.path.join(cuda_home, lib_dir) - if os.path.exists(cuda_lib_path): - break - # get a list of candidates for the version number - # which are files containing cudart - candidate_names = list(glob.glob(os.path.join(cuda_lib_path, '*cudart*'))) - candidate_names = [os.path.basename(c) for c in candidate_names] - # if we didn't find any cudart, ask nvcc - if len(candidate_names) == 0: - proc = Popen(['nvcc', '--version'], stdout=PIPE, stderr=PIPE) - out, err = proc.communicate() - candidate_names = [out.decode().rsplit('V')[-1]] - - # suppose version is MAJOR.MINOR.PATCH, all numbers - version_regex = re.compile(r'[0-9]+\.[0-9]+\.[0-9]+') - candidates = [c.group() for c in map(version_regex.search, candidate_names) if c] - if len(candidates) > 0: - # normally only one will be retrieved, take the first result - return candidates[0] - # if no candidates were found, try MAJOR.MINOR - version_regex = re.compile(r'[0-9]+\.[0-9]+') - candidates = [c.group() for c in map(version_regex.search, candidate_names) if c] - if len(candidates) > 0: - return candidates[0] - if check_negative_env_flag('USE_CUDA'): USE_CUDA = False CUDA_HOME = None - CUDA_VERSION = None else: if IS_LINUX or IS_DARWIN: CUDA_HOME = os.getenv('CUDA_HOME', LINUX_HOME) @@ -78,5 +41,4 @@ def find_cuda_version(cuda_home): CUDA_HOME = os.path.dirname(cuda_path) else: CUDA_HOME = None - CUDA_VERSION = find_cuda_version(CUDA_HOME) USE_CUDA = CUDA_HOME is not None From 76fc0285336b8127b97351d1b1dc2d54f6517676 Mon Sep 17 00:00:00 
2001 From: Edward Yang Date: Fri, 4 Oct 2019 07:59:13 -0700 Subject: [PATCH 053/113] Revert D17743310: [pytorch][PR] Allow use cpu_serial_kernel with void-lambda Test Plan: revert-hammer Differential Revision: D17743310 Original commit changeset: a149751f2d67 fbshipit-source-id: 043240201d67966dd08b7b1bc2f9bf4897923e00 --- aten/src/ATen/native/cpu/IsContiguous.h | 40 ++++-------------- aten/src/ATen/native/cpu/Loops.h | 41 +++++-------------- aten/src/ATen/test/tensor_iterator_test.cpp | 45 +-------------------- 3 files changed, 18 insertions(+), 108 deletions(-) diff --git a/aten/src/ATen/native/cpu/IsContiguous.h b/aten/src/ATen/native/cpu/IsContiguous.h index b44e501171fff..6392746a8013a 100644 --- a/aten/src/ATen/native/cpu/IsContiguous.h +++ b/aten/src/ATen/native/cpu/IsContiguous.h @@ -5,58 +5,34 @@ namespace at { namespace native { namespace { // n: number of function arguments (arity) // traits: function_traits (see FunctionTraits.h) // s: index of scalar argument or -1 -template +template struct IsContiguous { static bool eval(const int64_t* strides) { using type = typename traits::template arg::type; - return strides[stride_index] == (s == n ? 0 : sizeof(type)) && - IsContiguous::eval(strides); + return strides[n] == (s == n ? 0 : sizeof(type)) && + IsContiguous::eval(strides); } }; -// will be called when there is an output exists template -struct IsContiguous<0, 0, traits, s> { +struct IsContiguous<0, traits, s> { static bool eval(const int64_t* strides) { return strides[0] == sizeof(typename traits::result_type); } }; -// will be called when there is no output -template -struct IsContiguous<0, -1, traits, s> { - static bool eval(const int64_t* strides) { - return true; - } -}; - // output and all inputs are contiguous -template ::value>::type* = nullptr> +template static inline bool is_contiguous(const int64_t* strides) { - return IsContiguous::eval(strides); -} - -template ::value>::type* = nullptr> -static inline bool is_contiguous(const int64_t* strides) { - return IsContiguous::eval(strides); + return IsContiguous::eval(strides); } // input at `s` is scalar (stride 0); output and other inputs are contiguous // NB: output is typically at strides[0] so first input corresponds to s=1 -template ::value>::type* = nullptr> -static inline bool is_contiguous_scalar(const int64_t* strides) { - static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); - return IsContiguous::eval(strides); -} - -template ::value>::type* = nullptr> +template static inline bool is_contiguous_scalar(const int64_t* strides) { static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds"); - return IsContiguous::eval(strides); + return IsContiguous::eval(strides); } }}} diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h index 7ae0e4d0e93ba..e4bc9bc492b49 100644 --- a/aten/src/ATen/native/cpu/Loops.h +++ b/aten/src/ATen/native/cpu/Loops.h @@ -80,40 +80,13 @@ dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& o return dereference_vec_impl(data, opt_scalar, S, i, Indices{}); } -template ::result_type>::value>::type* = nullptr> -static inline void -execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) { - using traits = function_traits; - using result_type = typename traits::result_type; - for (; i < n; i++) { - result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); - *out_ptr = c10::guts::apply(op, dereference( - &data[1], - &strides[1], - i)); - } -} - 
-template ::result_type>::value>::type* = nullptr> -static inline void -execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) { - using traits = function_traits; - for (; i < n; i++) { - c10::guts::apply(op, dereference( - &data[0], - &strides[0], - i)); - } -} - // Basic loop operation (one output, N inputs). May be auto-vectorized // by the compiler. Supports inputs and outputs of different types. template static inline void basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t op) { using traits = function_traits; + using result_type = typename traits::result_type; constexpr int ntensors = traits::arity + 1; // Copying strides to temporary array helps auto vectorization in older GCC @@ -123,7 +96,13 @@ basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_ strides[arg] = strides_[arg]; } - execute_op(data, strides, i, n, op); + for (; i < n; i++) { + result_type* out_ptr = (result_type*)(data[0] + i * strides[0]); + *out_ptr = c10::guts::apply(op, dereference( + &data[1], + &strides[1], + i)); + } } // Explicitly vectorized loop implementation. All inputs and outputs must be @@ -226,8 +205,7 @@ void cpu_kernel_vec(TensorIterator& iter, func_t op, vec_func_t vop) { template void cpu_serial_kernel(TensorIterator& iter, func_t op) { using traits = function_traits; - TORCH_INTERNAL_ASSERT((std::is_void::value && - iter.noutputs() == 0 && iter.ntensors() == traits::arity) || (iter.ntensors() >= traits::arity + 1)); + TORCH_INTERNAL_ASSERT(iter.ntensors() >= traits::arity + 1); iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) { if (is_contiguous(strides)) { @@ -239,7 +217,6 @@ void cpu_serial_kernel(TensorIterator& iter, func_t op) { }); } }, {0, iter.numel()}); - iter.cast_outputs(); } }}} // namespace at::native:: diff --git a/aten/src/ATen/test/tensor_iterator_test.cpp b/aten/src/ATen/test/tensor_iterator_test.cpp index 6939d32ac7686..ef4d17a2f8b3f 100644 --- a/aten/src/ATen/test/tensor_iterator_test.cpp +++ b/aten/src/ATen/test/tensor_iterator_test.cpp @@ -61,41 +61,17 @@ TEST(TensorIteratorTest, SerialLoopUnary_##name) { \ ASSERT_ANY_THROW(out.equal(expected)); \ } -#define NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE(ctype,name) \ -TEST(TensorIteratorTest, SerialLoopUnaryNoOutput_##name) { \ - auto in = random_tensor_for_type(k##name); \ - auto iter = at::TensorIterator(); \ - iter.add_input(in); \ - iter.build(); \ - int64_t acc = 0; \ - at::native::cpu_serial_kernel(iter, [&](ctype a) -> void { acc++; }); \ - EXPECT_TRUE(acc == in.numel()); \ -} - #define BINARY_TEST_ITER_FOR_TYPE(ctype,name) \ TEST(TensorIteratorTest, SerialLoopBinary_##name) { \ Tensor out; \ auto in1 = random_tensor_for_type(k##name); \ auto in2 = random_tensor_for_type(k##name); \ auto expected = in1.add(in2); \ - auto iter = TensorIterator::binary_op(out, in1, in2); \ + auto iter = TensorIterator::binary_op(out, in1, in2); \ at::native::cpu_serial_kernel(iter, [=](ctype a, ctype b) -> int { return a + b; }); \ ASSERT_ANY_THROW(out.equal(expected)); \ } -#define NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE(ctype,name) \ -TEST(TensorIteratorTest, SerialLoopBinaryNoOutput_##name) { \ - auto in1 = random_tensor_for_type(k##name); \ - auto in2 = random_tensor_for_type(k##name); \ - auto iter = at::TensorIterator(); \ - iter.add_input(in1); \ - iter.add_input(in2); \ - iter.build(); \ - int64_t acc = 0; \ - at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b) -> void { acc++; }); \ - EXPECT_TRUE(acc == 
in1.numel()); \ -} - #define POINTWISE_TEST_ITER_FOR_TYPE(ctype,name) \ TEST(TensorIteratorTest, SerialLoopPointwise_##name) { \ Tensor out; \ @@ -113,21 +89,6 @@ TEST(TensorIteratorTest, SerialLoopPointwise_##name) { ASSERT_ANY_THROW(out.equal(expected)); \ } -#define NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE(ctype,name) \ -TEST(TensorIteratorTest, SerialLoopPoinwiseNoOutput_##name) { \ - auto in1 = random_tensor_for_type(k##name); \ - auto in2 = random_tensor_for_type(k##name); \ - auto in3 = random_tensor_for_type(k##name); \ - auto iter = at::TensorIterator(); \ - iter.add_input(in1); \ - iter.add_input(in2); \ - iter.add_input(in3); \ - iter.build(); \ - int64_t acc = 0; \ - at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b, ctype c) -> void { acc++; }); \ - EXPECT_TRUE(acc == in1.numel()); \ -} - // The alternative way to calculate a < b is (b - a).clamp(0).toBool() // To prevent an overflow in subtraction (b - a) for unsigned types(unit, bool) // we will convert in to int first @@ -151,9 +112,6 @@ TEST(TensorIteratorTest, ComparisonLoopBinary_##name) { AT_FORALL_SCALAR_TYPES(UNARY_TEST_ITER_FOR_TYPE) AT_FORALL_SCALAR_TYPES(BINARY_TEST_ITER_FOR_TYPE) AT_FORALL_SCALAR_TYPES(POINTWISE_TEST_ITER_FOR_TYPE) -AT_FORALL_SCALAR_TYPES(NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE) -AT_FORALL_SCALAR_TYPES(NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE) -AT_FORALL_SCALAR_TYPES(NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE) AT_FORALL_SCALAR_TYPES_AND(Bool, COMPARISON_TEST_ITER_FOR_TYPE) TEST(TensorIteratorTest, SerialLoopSingleThread) { @@ -214,4 +172,3 @@ TEST(TensorIteratorTest, DoNotComputeCommonDTypeIfOutputIsUndefined) { iter.compute_common_dtype_only_for_inputs(); ASSERT_ANY_THROW(iter.build()); } - From b7fb2b8862f7f2cf20120f684a3e422b86e9c2af Mon Sep 17 00:00:00 2001 From: Pearu Peterson Date: Fri, 4 Oct 2019 08:07:44 -0700 Subject: [PATCH 054/113] Implement pickle support for sparse tensors and torch.layout instances (#27062) Summary: Resolves issue https://github.com/pytorch/pytorch/issues/16667 and https://github.com/OpenMined/PySyft/issues/2326 Pull Request resolved: https://github.com/pytorch/pytorch/pull/27062 Differential Revision: D17762932 Pulled By: ezyang fbshipit-source-id: dd99c1f4ac8eb2286eb55aa20ce973f60ce7b7e1 --- test/test_sparse.py | 34 ++++++++++++++++++++++++++++++++++ torch/_utils.py | 9 +++++++++ torch/serialization.py | 18 ++++++++++++++++++ torch/tensor.py | 10 ++++++++++ 4 files changed, 71 insertions(+) diff --git a/test/test_sparse.py b/test/test_sparse.py index f7795c68046a9..c470ed66d9bbf 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -3,6 +3,7 @@ import itertools import functools import random +import sys import unittest from common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, do_test_empty_full, load_tests from common_cuda import TEST_CUDA @@ -2054,6 +2055,39 @@ def test_change_tensor_metadata(self): self.assertEqual(list(t.coalesce().indices().size()), [2, 1]) self.assertEqual(list(t.coalesce().values().size()), [1, 3]) + def test_pickle(self): + if sys.version_info[0] == 2: + import cPickle as pickle + else: + import pickle + + shape_sparse_dim_nnz = [ + ((), 0, 2), + ((0,), 0, 10), + ((2,), 0, 3), + ((100, 3), 1, 3), + ((100, 20, 3), 2, 0), + ((10, 0, 3), 0, 3), + ((10, 0, 3), 0, 0), + ] + + for shape, sparse_dim, nnz in shape_sparse_dim_nnz: + indices_shape = torch.Size((sparse_dim, nnz)) + values_shape = torch.Size((nnz,) + shape[sparse_dim:]) + indices = torch.arange(indices_shape.numel(), dtype=self.index_tensor(0).dtype, + 
device=self.device).view(indices_shape) + for d in range(sparse_dim): + indices[d].clamp_(max=(shape[d] - 1)) # make it valid index + if self.is_uncoalesced and indices.numel() > 0: + indices[:, -1] = indices[:, 0] # make it uncoalesced + values_numel = values_shape.numel() + values = torch.arange(values_numel, dtype=self.value_dtype, + device=self.device).view(values_shape).div_(values_numel / 2.) + sp_tensor = self.sparse_tensor(indices, values, shape) + serialized = pickle.dumps(sp_tensor) + sp_tensor_loaded = pickle.loads(serialized) + self.assertEqual(sp_tensor, sp_tensor_loaded) + class TestUncoalescedSparse(TestSparse): def setUp(self): diff --git a/torch/_utils.py b/torch/_utils.py index d3748842aefbe..d8c40b140075b 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -140,11 +140,20 @@ def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, bac tensor._backward_hooks = backward_hooks return tensor + +def _rebuild_sparse_tensor(layout, data): + if layout == torch.sparse_coo: + indices, values, size = data + return torch.sparse_coo_tensor(indices, values, size) + raise NotImplementedError("rebuilding sparse tensor for layout %s" % (layout)) + + def _rebuild_xla_tensor(data, dtype, device, requires_grad): tensor = torch.from_numpy(data).to(dtype=dtype, device=device) tensor.requires_grad = requires_grad return tensor + def _rebuild_qtensor(storage, storage_offset, size, stride, quantizer_params, requires_grad, backward_hooks): qscheme = quantizer_params[0] if qscheme == torch.per_tensor_affine: diff --git a/torch/serialization.py b/torch/serialization.py index 22ff981a3909d..ca5a43c7bd599 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -8,6 +8,7 @@ import tarfile import tempfile import warnings +import copyreg from contextlib import closing, contextmanager from ._utils import _import_dotted_name from ._six import string_classes as _string_classes @@ -429,6 +430,23 @@ def load(f, map_location=None, pickle_module=pickle, **pickle_load_args): f.close() +# Register pickling support for layout instances such as +# torch.sparse_coo, etc +def _get_layout(name): + """Get layout extension object from its string representation. + """ + cache = _get_layout.cache + if not cache: + for v in torch.__dict__.values(): + if isinstance(v, torch.layout): + cache[str(v)] = v + return cache[name] + + +_get_layout.cache = {} +copyreg.pickle(torch.layout, lambda obj: (_get_layout, (str(obj),))) + + def _load(f, map_location, pickle_module, **pickle_load_args): deserialized_objects = {} diff --git a/torch/tensor.py b/torch/tensor.py index ac99e58ec7c3c..4c45299700de1 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -80,6 +80,16 @@ def __reduce_ex__(self, proto): self.requires_grad, OrderedDict()) return (torch._utils._rebuild_qtensor, args) + elif self.is_sparse: + if self.layout == torch.sparse_coo: + args = (self.layout, + (self._indices(), + self._values(), + self.size())) + else: + raise NotImplementedError( + 'sparse tensor __reduce_ex__ for layout `%s`' % (self.layout)) + return (torch._utils._rebuild_sparse_tensor, args) else: args = (self.storage(), self.storage_offset(), From c389156fc4872ad4400046bf146abb6b9ec9e075 Mon Sep 17 00:00:00 2001 From: Sameer Deshmukh Date: Fri, 4 Oct 2019 08:21:10 -0700 Subject: [PATCH 055/113] move new_zeros to core from THP (#26511) Summary: Fix for issue https://github.com/pytorch/pytorch/issues/25831 ezyang can you please have a look? 
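For reference, the user-visible behavior of `new_zeros` is unchanged by the move; a minimal usage sketch (illustrative only, not part of the patch):
```
import torch

x = torch.ones(2, 3, dtype=torch.float64)
# new_zeros inherits dtype and device from `x` unless overridden.
y = x.new_zeros((4,))
assert y.dtype == torch.float64
assert y.shape == (4,)
```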
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26511 Differential Revision: D17763037 Pulled By: ezyang fbshipit-source-id: 3596c01c4ab421e7785d6055cc813806f840a5c7 --- aten/src/ATen/core/OpsAlreadyMovedToC10.cpp | 1 + aten/src/ATen/core/TensorBody.h | 1 + aten/src/ATen/core/TensorMethods.h | 9 +++++++++ aten/src/ATen/native/TensorFactories.cpp | 8 ++++++++ aten/src/ATen/native/native_functions.yaml | 3 +++ .../templates/python_variable_methods.cpp | 10 ---------- tools/pyi/gen_pyi.py | 2 -- torch/csrc/utils/tensor_new.cpp | 15 --------------- torch/csrc/utils/tensor_new.h | 1 - 9 files changed, 22 insertions(+), 28 deletions(-) diff --git a/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp b/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp index f08f3321ccf57..e06b796202c1e 100644 --- a/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp +++ b/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp @@ -903,6 +903,7 @@ bool aten_op_is_not_moved_to_c10_yet(const c10::OperatorName& opName) { {"aten::empty", "memory_format"}, {"aten::new_empty", ""}, {"aten::new_full", ""}, + {"aten::new_zeros", ""}, {"aten::_empty_affine_quantized", ""}, {"aten::_empty_per_channel_affine_quantized", ""}, {"aten::empty", "out"}, diff --git a/aten/src/ATen/core/TensorBody.h b/aten/src/ATen/core/TensorBody.h index 9de737b527314..edd2be2d1cdaf 100644 --- a/aten/src/ATen/core/TensorBody.h +++ b/aten/src/ATen/core/TensorBody.h @@ -507,6 +507,7 @@ class CAFFE2_API Tensor { Tensor dot(const Tensor & tensor) const; Tensor new_empty(IntArrayRef size, const TensorOptions & options={}) const; Tensor new_full(IntArrayRef size, Scalar fill_value, const TensorOptions & options={}) const; + Tensor new_zeros(IntArrayRef size, const TensorOptions & options={}) const; Tensor & resize_(IntArrayRef size) const; Tensor erf() const; Tensor & erf_() const; diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h index 9003a116aa42c..fdcf7d39944cc 100644 --- a/aten/src/ATen/core/TensorMethods.h +++ b/aten/src/ATen/core/TensorMethods.h @@ -1042,6 +1042,15 @@ inline Tensor Tensor::new_full(IntArrayRef size, Scalar fill_value, const Tensor return table->callUnboxed(const_cast(*this), size, fill_value, options); #endif } +inline Tensor Tensor::new_zeros(IntArrayRef size, const TensorOptions & options) const { +#ifdef USE_STATIC_DISPATCH + at::AutoNonVariableTypeMode _var_guard(true); + return TypeDefault::new_zeros(const_cast(*this), size, options); +#else + static auto table = globalATenDispatch().getOpTable("aten::new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"); + return table->callUnboxed(const_cast(*this), size, options); +#endif +} inline Tensor & Tensor::resize_(IntArrayRef size) const { #ifdef USE_STATIC_DISPATCH at::AutoNonVariableTypeMode _var_guard(true); diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 0a1d7422a04b7..a8509851974f2 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -711,6 +711,14 @@ Tensor zeros_like(const Tensor& self, const TensorOptions& options) { return native::zeros(self.sizes(), options); } +Tensor new_zeros( + const Tensor& self, + IntArrayRef size, + const TensorOptions& options + ) { + return at::zeros(size, self.options().merge_in(options)); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ bartlett_window ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Tensor bartlett_window(int64_t window_length, const TensorOptions& options) { diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 910a6c867eb40..0799f015bbf55 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -1027,6 +1027,9 @@ - func: new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor variants: method +- func: new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor + variants: method + # other overrides are to provide a more helpful error message that dtype is required - func: _empty_affine_quantized(int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, float scale=1, int zero_point=0, MemoryFormat? 
memory_format=contiguous_format) -> Tensor dispatch: diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index a41ff6cb68956..d3ee543b27a6c 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -604,15 +604,6 @@ static PyObject * THPVariable_new_tensor(PyObject* self, PyObject* args, PyObjec END_HANDLE_TH_ERRORS } -static PyObject * THPVariable_new_zeros(PyObject* self, PyObject* args, PyObject* kwargs) -{ - HANDLE_TH_ERRORS - auto& self_ = reinterpret_cast(self)->cdata; - OptionalDeviceGuard device_guard(device_of(self_)); - return THPVariable_Wrap(torch::utils::new_zeros(legacyExtractTypeId(self_), self_.scalar_type(), args, kwargs)); - END_HANDLE_TH_ERRORS -} - static PyObject * THPVariable_storage(PyObject* self, PyObject* arg) { HANDLE_TH_ERRORS @@ -789,7 +780,6 @@ PyMethodDef variable_methods[] = { {"new", (PyCFunction)(void(*)(void))THPVariable_new, METH_VARARGS | METH_KEYWORDS, NULL}, {"new_ones", (PyCFunction)(void(*)(void))THPVariable_new_ones, METH_VARARGS | METH_KEYWORDS, NULL}, {"new_tensor", (PyCFunction)(void(*)(void))THPVariable_new_tensor, METH_VARARGS | METH_KEYWORDS, NULL}, - {"new_zeros", (PyCFunction)(void(*)(void))THPVariable_new_zeros, METH_VARARGS | METH_KEYWORDS, NULL}, {"nonzero", (PyCFunction)(void(*)(void))THPVariable_nonzero, METH_VARARGS | METH_KEYWORDS, NULL}, {"numpy", (PyCFunction)THPVariable_numpy, METH_NOARGS, NULL}, {"record_stream", (PyCFunction)THPVariable_record_stream, METH_O, NULL}, diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index f7fd5a939d453..794f9202f141c 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -489,8 +489,6 @@ def gen_pyi(declarations_path, out): 'def stride(self, _int) -> _int: ...'], 'new_ones': ['def new_ones(self, size: {}, {}) -> Tensor: ...'. format(type_to_python('IntArrayRef'), FACTORY_PARAMS)], - 'new_zeros': ['def new_zeros(self, size: {}, {}) -> Tensor: ...'. - format(type_to_python('IntArrayRef'), FACTORY_PARAMS)], 'new_tensor': ["def new_tensor(self, data: Any, {}) -> Tensor: ...".format(FACTORY_PARAMS)], # clamp has no default values in the Declarations 'clamp': ["def clamp(self, min: _float=-inf, max: _float=inf," diff --git a/torch/csrc/utils/tensor_new.cpp b/torch/csrc/utils/tensor_new.cpp index 08124c1a3411e..c2982468cf703 100644 --- a/torch/csrc/utils/tensor_new.cpp +++ b/torch/csrc/utils/tensor_new.cpp @@ -671,19 +671,4 @@ Tensor new_ones(c10::TensorTypeId type_id, at::ScalarType scalar_type, PyObject* throw std::runtime_error("new_ones(): invalid arguments"); } -Tensor new_zeros(c10::TensorTypeId type_id, at::ScalarType scalar_type, PyObject* args, PyObject* kwargs) { - static PythonArgParser parser({ - "new_zeros(IntArrayRef size, *, ScalarType dtype=None, Device? 
device=None, bool requires_grad=False)", - }, /*traceable=*/true); - - ParsedArgs<4> parsed_args; - auto r = parser.parse(args, kwargs, parsed_args); - if (r.idx == 0) { - const auto actual_type_id = typeIdWithDefault(r, 2, type_id); - const auto actual_scalar_type = r.scalartypeWithDefault(1, scalar_type); - return dispatch_zeros(actual_type_id, actual_scalar_type, r.deviceOptional(2), r.intlist(0)).set_requires_grad(r.toBool(3)); - } - throw std::runtime_error("new_zeros(): invalid arguments"); -} - }} // namespace torch::utils diff --git a/torch/csrc/utils/tensor_new.h b/torch/csrc/utils/tensor_new.h index 76a9386986086..1fa7d75c131ac 100644 --- a/torch/csrc/utils/tensor_new.h +++ b/torch/csrc/utils/tensor_new.h @@ -18,6 +18,5 @@ at::Tensor tensor_ctor(c10::TensorTypeId type_id, at::ScalarType scalar_type, Py at::Tensor as_tensor(c10::TensorTypeId type_id, at::ScalarType scalar_type, PyObject* args, PyObject* kwargs); at::Tensor new_tensor(c10::TensorTypeId type_id, at::ScalarType scalar_type, PyObject* args, PyObject* kwargs); at::Tensor new_ones(c10::TensorTypeId type_id, at::ScalarType scalar_type, PyObject* args, PyObject* kwargs); -at::Tensor new_zeros(c10::TensorTypeId type_id, at::ScalarType scalar_type, PyObject* args, PyObject* kwargs); }} // namespace torch::utils From da669c25ee60b0948ec5342d6a8746297a54b659 Mon Sep 17 00:00:00 2001 From: Anjali Chourdia Date: Fri, 4 Oct 2019 08:27:46 -0700 Subject: [PATCH 056/113] autograd: double backwards function for binary_cross_entropy loss Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26983 Reviewed By: albanD Differential Revision: D17714357 Pulled By: anjali411 fbshipit-source-id: cebfe09a9048c4be457b7f2718bc396c06ecabee --- test/common_nn.py | 10 +++---- tools/autograd/derivatives.yaml | 4 +++ tools/autograd/templates/Functions.cpp | 36 ++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/test/common_nn.py b/test/common_nn.py index e5e35a2e9b27e..694cf5960075e 100644 --- a/test/common_nn.py +++ b/test/common_nn.py @@ -295,8 +295,8 @@ def bceloss_no_reduce_test(): lambda i: F.binary_cross_entropy(i, t.type_as(i), reduction='none')), input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), - check_gradgrad=False, - pickle=False) + pickle=False, + precision=7e-4) def bceloss_no_reduce_scalar_test(): @@ -307,7 +307,6 @@ def bceloss_no_reduce_scalar_test(): lambda i: F.binary_cross_entropy(i, t.type_as(i), reduction='none')), input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()), - check_gradgrad=False, pickle=False) @@ -321,8 +320,8 @@ def bceloss_weights_no_reduce_test(): weight=weights.type_as(i), reduction='none')), input_fn=lambda: torch.rand(15, 10).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, p, m: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, - check_gradgrad=False, - pickle=False + pickle=False, + precision=3e-4 ) @@ -336,7 +335,6 @@ def bceloss_weights_no_reduce_scalar_test(): weight=weights.type_as(i), reduction='none')), input_fn=lambda: torch.rand(()).clamp_(2.8e-2, 1 - 2.8e-2), reference_fn=lambda i, *_: -(t * i.log() + (1 - t) * (1 - i).log()) * weights, - check_gradgrad=False, pickle=False ) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index c0d253e68ce5a..91fa35e379618 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -953,6 
+953,10 @@ - name: binary_cross_entropy(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor self: binary_cross_entropy_backward(grad, self, target, weight, reduction) +- name: binary_cross_entropy_backward(Tensor grad_output, Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean) -> Tensor + self: binary_cross_entropy_double_backward(grad_output, grad, self, target, weight, reduction) + grad_output: binary_cross_entropy_double_backward_grad_output(grad, self, target, weight, reduction) + - name: binary_cross_entropy_with_logits(Tensor self, Tensor target, Tensor? weight=None, Tensor? pos_weight=None, int reduction=Mean) -> Tensor self: binary_cross_entropy_with_logits_backward(grad, self, target, weight, pos_weight, reduction) target: binary_cross_entropy_with_logits_target_backward(grad, self, target, weight, pos_weight, reduction) diff --git a/tools/autograd/templates/Functions.cpp b/tools/autograd/templates/Functions.cpp index 391aafd106fc0..b8991c746139b 100644 --- a/tools/autograd/templates/Functions.cpp +++ b/tools/autograd/templates/Functions.cpp @@ -902,6 +902,42 @@ Tensor log_softmax_double_backward(const Tensor & grad, const Tensor & grad_outp return z * grad_output.sum(dim, true) * ((grad * z).sum(dim, true) - grad); } +Tensor binary_cross_entropy_double_backward(const Tensor & grad_output, const Tensor & grad, const Tensor & input, const Tensor & target, const Tensor& weight, int64_t reduction) { + auto eps = 1e-12; + auto inp_pl_eps = input + eps; + auto one_m_inp_pl_eps = 1 - input + eps; + // gradient wrt input + auto gI = (input * input - 2 * input * target + target) / (inp_pl_eps.pow(2) * one_m_inp_pl_eps.pow(2)); + gI *= (grad * grad_output); + + if (weight.defined()) { + gI *= weight; + } + if (reduction == Reduction::Mean) { + return gI / input.numel(); + } else if (reduction == Reduction::Sum) { + return gI.sum(); + } + return gI; +} + +Tensor binary_cross_entropy_double_backward_grad_output(const Tensor & grad, const Tensor & input, const Tensor & target, const Tensor& weight, int64_t reduction) { + auto eps = 1e-12; + // gradient wrt grad_output + auto ggO = (input - target) / ((input + eps) * (1 - input + eps)); + ggO *= grad; + + if (weight.defined()) { + ggO *= weight; + } + if (reduction == Reduction::Mean) { + return ggO / input.numel(); + } else if (reduction == Reduction::Sum) { + return ggO.sum(); + } + return ggO; +} + Tensor l1_loss_double_backward_grad_output(const Tensor & grad, const Tensor & input, const Tensor & target, int64_t reduction) { auto output = l1_loss_backward(grad, input, target, Reduction::None); if (reduction == Reduction::Mean) { From 28b1f586f669c6c14d5eabffa162b59f0f4d853d Mon Sep 17 00:00:00 2001 From: Vincent Quenneville-Belair Date: Fri, 4 Oct 2019 08:51:03 -0700 Subject: [PATCH 057/113] Change schedulers to chainable form (#26423) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/26423 Enable chainable schedulers as requested in #13022 by implementing the changes mentioned below from [comment](https://github.com/pytorch/pytorch/pull/21800#issuecomment-513370208). 
* Changing the behavior of schedulers to the chainable formula when available
* Using the closed form whenever epoch is different from None until the next release, with a deprecation warning
* Making `get_computed_values` the supported way of obtaining the last learning rate computed by the scheduler (see [comment](https://github.com/pytorch/pytorch/pull/21800#issuecomment-513940729) for the new syntax)
* Returning a deprecation warning when invoking the undocumented get_lr function (see [comment](https://github.com/pytorch/pytorch/pull/21800#discussion_r294305485)) referring to `get_computed_values`, and deprecating it in the next release
* `CosineAnnealingWarmRestarts` still takes an epoch parameter, as it is the only scheduler whose mechanism relies on fractional epochs
* `MultiplicativeLR` consumes a function providing the multiplicative factor at each epoch. It mimics `LambdaLR` in its syntax.

# #20527

### Before

The user calls the scheduler with a constant epoch, either across loops or in the same loop.
```
import torch.optim as optim
from torch import nn

conv = nn.Conv2d(3,3,3)
optimizer = optim.Adam(conv.parameters())
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 2)

# Scheduler with sometimes-constant epoch number
for epoch in [0, 0, 1, 1, 2, 2, 3, 3]:
    lr_scheduler.step(epoch)
    print(optimizer.param_groups[0]['lr'])
```

### After

If the user wants to step only when the epoch number actually changes, they can track the last seen epoch manually:
```
import torch.optim as optim
from torch import nn

conv = nn.Conv2d(3,3,3)
optimizer = optim.Adam(conv.parameters())
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, 2)

last_epoch = -1
for epoch in [0, 0, 1, 1, 2, 2, 3, 3]:
    # Check if epoch number has changed manually
    if epoch - last_epoch > 0:
        lr_scheduler.step()
    last_epoch = epoch
    print(epoch, lr_scheduler.get_computed_values())
```

# #22107

### Before
```
import torch
from torchvision.models import resnet18

net = resnet18()
optimizer = torch.optim.SGD(net.parameters(), 0.1)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 6, 9], gamma=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.1)

for i in range(10):
    # Scheduler computes and returns new learning rate, leading to unexpected behavior
    print(i, scheduler.get_lr())
    scheduler.step()
```

### After
```
import torch
from torchvision.models import resnet18

net = resnet18()
optimizer = torch.optim.SGD(net.parameters(), 0.1)
lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3, 6, 9], gamma=0.1)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.1)

for i in range(10):
    # Returns last computed learning rate by scheduler
    print(i, lr_scheduler.get_computed_values())
    lr_scheduler.step()
```

# ghstack

This contains the changes from #24352. Opening again since they were reverted.

This reverts commit 1c477b7e1f378e9c1f8efed296241f68a8a4372b.
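Chaining itself, the headline feature above, then looks like the following (a minimal sketch, assuming two multiplicative schedulers attached to the same optimizer; the exact printed values depend on the schedulers chosen):
```
import torch
from torch import nn

model = nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Both schedulers wrap the same optimizer; each step() scales the current
# lr in place, so their effects compose multiplicatively.
exp_decay = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
step_decay = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

for epoch in range(10):
    optimizer.step()
    exp_decay.step()
    step_decay.step()
    print(epoch, optimizer.param_groups[0]['lr'])
```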
Test Plan: Imported from OSS Differential Revision: D17460427 Pulled By: vincentqb fbshipit-source-id: 8c10f4e7246d6756ac91df734e8bed65bdef63c9 --- test/test_optim.py | 294 +++++++++++++++++++++++++++--------- torch/optim/lr_scheduler.py | 199 +++++++++++++++++++++--- 2 files changed, 401 insertions(+), 92 deletions(-) diff --git a/test/test_optim.py b/test/test_optim.py index 1bbd88c152cf8..7cd0c85e22924 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -3,7 +3,6 @@ import unittest import functools from copy import deepcopy -from bisect import bisect_right import torch from torch._six import inf import torch.optim as optim @@ -11,9 +10,9 @@ from torch.optim import SGD from torch.autograd import Variable from torch import sparse -from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, \ - ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, _LRScheduler, \ - CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR +from torch.optim.lr_scheduler import LambdaLR, StepLR, \ + MultiStepLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \ + _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR from common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests # load_tests from common_utils is used to automatically filter tests for @@ -499,36 +498,6 @@ def __eq__(self, other): return False -class LegacyStepLR(StepLR): - def get_lr(self): - return [base_lr * self.gamma ** (self.last_epoch // self.step_size) - for base_lr in self.base_lrs] - - -class LegacyMultiStepLR(MultiStepLR): - def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1): - self.milestones = sorted(milestones) - self.gamma = gamma - super(MultiStepLR, self).__init__(optimizer, last_epoch) - - def get_lr(self): - return [base_lr * self.gamma ** bisect_right(self.milestones, self.last_epoch) - for base_lr in self.base_lrs] - - -class LegacyExponentialLR(ExponentialLR): - def get_lr(self): - return [base_lr * self.gamma ** self.last_epoch - for base_lr in self.base_lrs] - - -class LegacyCosineAnnealingLR(CosineAnnealingLR): - def get_lr(self): - return [self.eta_min + (base_lr - self.eta_min) * - (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2 - for base_lr in self.base_lrs] - - class TestLRScheduler(TestCase): def setUp(self): super(TestLRScheduler, self).setUp() @@ -571,7 +540,7 @@ def test_old_pattern_warning(self): self.assertTrue(len(ws) == 0, "No warning should be raised") def old_pattern(): - for e in range(epochs): + for _ in range(epochs): scheduler.step() self.opt.step() @@ -585,8 +554,8 @@ def test_old_pattern_warning_with_arg(self): self.assertTrue(len(ws) == 0, "No warning should be raised") def old_pattern2(): - for e in range(epochs): - scheduler.step(e) + for _ in range(epochs): + scheduler.step() self.opt.step() self.assertWarnsRegex(old_pattern2, r'how-to-adjust-learning-rate') @@ -602,7 +571,7 @@ def test_old_pattern_warning_resuming(self): self.assertTrue(len(ws) == 0, "No warning should be raised") def old_pattern(): - for e in range(epochs): + for _ in range(epochs): scheduler.step() self.opt.step() @@ -619,13 +588,13 @@ def test_old_pattern_warning_resuming_with_arg(self): self.assertTrue(len(ws) == 0, "No warning should be raised") def old_pattern2(): - for e in range(epochs): - scheduler.step(e) + for _ in range(epochs): + scheduler.step() self.opt.step() self.assertWarnsRegex(old_pattern2, r'how-to-adjust-learning-rate') - def test_old_pattern_warning_with_overriden_optim_step(self): + def test_old_pattern_warning_with_overridden_optim_step(self): 
epochs = 35 for i, group in enumerate(self.opt.param_groups): group['initial_lr'] = 0.01 @@ -635,7 +604,7 @@ def test_old_pattern_warning_with_overriden_optim_step(self): scheduler = StepLR(self.opt, gamma=0.1, step_size=3, last_epoch=10) self.assertTrue(len(ws) == 0, "No warning should be raised") - # emulate use-case with optimizer.step overriden + # emulate use-case with optimizer.step overridden import types old_step = self.opt.step @@ -647,8 +616,8 @@ def new_step(o, *args, **kwargs): self.opt.step = types.MethodType(new_step, self.opt) def old_pattern2(): - for e in range(epochs): - scheduler.step(e) + for _ in range(epochs): + scheduler.step() self.opt.step() self.assertWarnsRegex(old_pattern2, r'how-to-adjust-learning-rate') @@ -662,7 +631,7 @@ def test_new_pattern_no_warning(self): with warnings.catch_warnings(record=True) as ws: warnings.simplefilter("always") # allow any warning to be raised - for e in range(epochs): + for _ in range(epochs): self.opt.step() scheduler.step() self.assertTrue(len(ws) == 0, "No warning should be raised") @@ -676,19 +645,19 @@ def test_new_pattern_no_warning_with_arg(self): with warnings.catch_warnings(record=True) as ws: warnings.simplefilter("always") # allow any warning to be raised - for e in range(epochs): + for _ in range(epochs): self.opt.step() - scheduler.step(e) + scheduler.step() self.assertTrue(len(ws) == 0, "No warning should be raised") - def test_new_pattern_no_warning_with_overriden_optim_step(self): + def test_new_pattern_no_warning_with_overridden_optim_step(self): epochs = 35 with warnings.catch_warnings(record=True) as ws: warnings.simplefilter("always") # allow any warning to be raised scheduler = StepLR(self.opt, gamma=0.1, step_size=3) self.assertTrue(len(ws) == 0, "No warning should be raised") - # emulate use-case with optimizer.step overriden + # emulate use-case with optimizer.step overridden import types old_step = self.opt.step @@ -706,6 +675,23 @@ def new_pattern(): self.assertWarnsRegex(new_pattern, r'`optimizer.step\(\)` has been overridden') + def _test_lr_is_constant_for_constant_epoch(self, scheduler): + l = [] + + # warnings.filterwarnings("ignore", category=DeprecationWarning) + for _ in range(10): + scheduler.step(2) + l.append(self.opt.param_groups[0]['lr']) + self.assertAlmostEqual(min(l), max(l)) + + def test_step_lr_is_constant_for_constant_epoch(self): + scheduler = StepLR(self.opt, 2) + self._test_lr_is_constant_for_constant_epoch(scheduler) + + def test_exponential_lr_is_constant_for_constant_epoch(self): + scheduler = ExponentialLR(self.opt, gamma=0.9) + self._test_lr_is_constant_for_constant_epoch(scheduler) + def test_step_lr(self): # lr = 0.05 if epoch < 3 # lr = 0.005 if 30 <= epoch < 6 @@ -716,6 +702,25 @@ def test_step_lr(self): scheduler = StepLR(self.opt, gamma=0.1, step_size=3) self._test(scheduler, targets, epochs) + def test_get_last_lr_step_lr(self): + from torch.nn import Parameter + epochs = 10 + optimizer = torch.optim.SGD([Parameter(torch.randn(2, 2, requires_grad=True))], 0.1) + targets = [[0.1] * 3 + [0.01] * 3 + [0.001] * 3 + [0.0001]] + scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3, gamma=0.1) + self._test_get_last_lr(scheduler, targets, epochs) + + def test_get_last_lr_multi_step_lr(self): + # lr = 0.05 if epoch < 2 + # lr = 0.005 if 2 <= epoch < 5 + # lr = 0.0005 if 5 <= epoch < 9 + # lr = 0.00005 if 9 <= epoch + epochs = 10 + single_targets = [0.05] * 2 + [0.005] * 3 + [0.0005] * 4 + [0.00005] * 1 + targets = [single_targets, list(map(lambda x: x * epochs, 
single_targets))] + scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) + self._test_get_last_lr(scheduler, targets, epochs) + def test_multi_step_lr(self): # lr = 0.05 if epoch < 2 # lr = 0.005 if 2 <= epoch < 5 @@ -744,28 +749,28 @@ def test_cos_anneal_lr(self): scheduler = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) self._test(scheduler, targets, epochs) - def test_legacy_step_lr(self): + def test_closed_form_step_lr(self): scheduler = StepLR(self.opt, gamma=0.1, step_size=3) - legacy_scheduler = LegacyStepLR(self.opt, gamma=0.1, step_size=3) - self._test_against_legacy(scheduler, legacy_scheduler, 20) + closed_form_scheduler = StepLR(self.opt, gamma=0.1, step_size=3) + self._test_against_closed_form(scheduler, closed_form_scheduler, 20) - def test_legacy_multi_step_lr(self): + def test_closed_form_multi_step_lr(self): scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) - legacy_scheduler = LegacyMultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) - self._test_against_legacy(scheduler, legacy_scheduler, 20) + closed_form_scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) + self._test_against_closed_form(scheduler, closed_form_scheduler, 20) - def test_legacy_exp_lr(self): + def test_closed_form_exp_lr(self): scheduler = ExponentialLR(self.opt, gamma=0.9) - legacy_scheduler = LegacyExponentialLR(self.opt, gamma=0.9) - self._test_against_legacy(scheduler, legacy_scheduler, 20) + closed_form_scheduler = ExponentialLR(self.opt, gamma=0.9) + self._test_against_closed_form(scheduler, closed_form_scheduler, 20) - def test_legacy_cos_anneal_lr(self): + def test_closed_form_cos_anneal_lr(self): eta_min = 1e-10 epochs = 20 T_max = 5 scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) - legacy_scheduler = LegacyCosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) - self._test_against_legacy(scheduler, legacy_scheduler, epochs) + closed_form_scheduler = CosineAnnealingLR(self.opt, T_max=T_max, eta_min=eta_min) + self._test_against_closed_form(scheduler, closed_form_scheduler, epochs) def test_reduce_lr_on_plateau1(self): epochs = 10 @@ -847,6 +852,145 @@ def test_reduce_lr_on_plateau8(self): threshold=0.1, patience=5, cooldown=5) self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs) + def test_compound_step_and_multistep_lr(self): + epochs = 10 + schedulers = [None] * 2 + schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3) + schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) + targets = [[0.05] * 2 + [0.005] * 1 + [5e-4] * 2 + [5e-5] + [5e-6] * 3 + [5e-8]] + self._test(schedulers, targets, epochs) + + def test_compound_step_and_exp_lr(self): + epochs = 10 + schedulers = [None] * 2 + single_targets = [0.05 * (0.9 ** x) for x in range(3)] + single_targets += [0.005 * (0.9 ** x) for x in range(3, 6)] + single_targets += [0.0005 * (0.9 ** x) for x in range(6, 9)] + single_targets += [0.00005 * (0.9 ** x) for x in range(9, 12)] + targets = [single_targets, list(map(lambda x: x * epochs, single_targets))] + schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3) + schedulers[1] = ExponentialLR(self.opt, gamma=0.9) + self._test(schedulers, targets, epochs) + + def test_compound_exp_and_multistep_lr(self): + epochs = 10 + schedulers = [None] * 2 + single_targets = [0.05 * (0.9 ** x) for x in range(2)] + single_targets += [0.005 * (0.9 ** x) for x in range(2, 5)] + single_targets += [0.0005 * (0.9 ** x) for x in range(5, 9)] + single_targets += [0.00005 * (0.9 ** x) for x in 
range(9, 11)] + targets = [single_targets, list(map(lambda x: x * epochs, single_targets))] + schedulers[0] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) + schedulers[1] = ExponentialLR(self.opt, gamma=0.9) + self._test(schedulers, targets, epochs) + + def test_compound_cosanneal_and_step_lr(self): + epochs = 10 + eta_min = 1e-10 + single_targets = [eta_min + (0.05 - eta_min) * + (1 + math.cos(math.pi * x / epochs)) / 2 + for x in range(epochs)] + single_targets = [x * 0.1 ** (i // 3) for i, x in enumerate(single_targets)] + targets = [single_targets, list(map(lambda x: x * epochs, single_targets))] + schedulers = [None] * 2 + schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) + schedulers[1] = StepLR(self.opt, gamma=0.1, step_size=3) + self._test(schedulers, targets, epochs) + + def test_compound_cosanneal_and_multistep_lr(self): + epochs = 10 + eta_min = 1e-10 + single_targets = [eta_min + (0.05 - eta_min) * + (1 + math.cos(math.pi * x / epochs)) / 2 + for x in range(epochs)] + multipliers = [1] * 2 + [0.1] * 3 + [0.01] * 4 + [0.001] + single_targets = [x * y for x, y in zip(single_targets, multipliers)] + targets = [single_targets, list(map(lambda x: x * epochs, single_targets))] + schedulers = [None] * 2 + schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) + schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9]) + self._test(schedulers, targets, epochs) + + def test_compound_cosanneal_and_exp_lr(self): + epochs = 10 + eta_min = 1e-10 + single_targets = [eta_min + (0.05 - eta_min) * + (1 + math.cos(math.pi * x / epochs)) / 2 + for x in range(epochs)] + multipliers = [0.1 ** i for i in range(epochs)] + single_targets = [x * y for x, y in zip(single_targets, multipliers)] + targets = [single_targets, list(map(lambda x: x * epochs, single_targets))] + schedulers = [None] * 2 + schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min) + schedulers[1] = ExponentialLR(self.opt, gamma=0.1) + self._test(schedulers, targets, epochs) + + def test_compound_reduce_lr_on_plateau1(self): + epochs = 10 + for param_group in self.opt.param_groups: + param_group['lr'] = 0.5 + single_targets = [0.5] * 20 + multipliers = [0.1 ** (i // 3) for i in range(20)] + single_targets = [x * y for x, y in zip(multipliers, single_targets)] + targets = [single_targets] + targets = targets[1:] # test runs step before checking lr + metrics = [10 - i * 0.0167 for i in range(20)] + schedulers = [None, None] + schedulers[0] = ReduceLROnPlateau(self.opt, threshold_mode='abs', mode='min', + threshold=0.01, patience=5, cooldown=5) + schedulers[1] = StepLR(self.opt, gamma=0.1, step_size=3) + self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs) + + def test_compound_reduce_lr_on_plateau2(self): + epochs = 22 + for param_group in self.opt.param_groups: + param_group['lr'] = 0.5 + single_targets = [0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2 + multipliers = [1] * 3 + [0.1] * 5 + [0.01] * 4 + [0.001] * 10 + single_targets = [x * y for x, y in zip(single_targets, multipliers)] + targets = [single_targets] + targets = targets[1:] # test runs step before checking lr + metrics = [10 - i * 0.0165 for i in range(22)] + schedulers = [None] * 2 + schedulers[0] = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs', + mode='min', threshold=0.1) + schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[3, 8, 12]) + self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs) + + def 
test_compound_reduce_lr_on_plateau3(self): + epochs = 22 + for param_group in self.opt.param_groups: + param_group['lr'] = 0.5 + single_targets = [0.5] * (2 + 6) + [0.05] * (5 + 6) + [0.005] * 4 + multipliers = [0.1 ** i for i in range(epochs)] + single_targets = [x * y for x, y in zip(multipliers, single_targets)] + targets = [single_targets] + targets = targets[1:] # test runs step before checking lr + metrics = [-0.8] * 2 + [-0.234] * 20 + schedulers = [None, None] + schedulers[0] = ReduceLROnPlateau(self.opt, mode='max', patience=5, cooldown=5, + threshold_mode='abs') + schedulers[1] = ExponentialLR(self.opt, gamma=0.1) + self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs) + + def test_compound_reduce_lr_on_plateau4(self): + epochs = 20 + for param_group in self.opt.param_groups: + param_group['lr'] = 0.05 + epochs = 10 + eta_min = 1e-10 + single_targets = [eta_min + (0.05 - eta_min) * + (1 + math.cos(math.pi * x / epochs)) / 2 + for x in range(epochs)] + targets = [single_targets] + targets = targets[1:] # test runs step before checking lr + metrics = [1.5 * (1.025 ** i) for i in range(20)] # 1.025 > 1.1**0.25 + schedulers = [None, None] + schedulers[0] = ReduceLROnPlateau(self.opt, mode='max', patience=3, + threshold_mode='rel', threshold=0.1) + schedulers[1] = CosineAnnealingLR(self.opt, epochs, eta_min) + self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs) + def test_cycle_lr_invalid_mode(self): with self.assertRaises(ValueError): scheduler = CyclicLR(self.opt, base_lr=0, max_lr=0, mode="CATS") @@ -1220,17 +1364,30 @@ def _check_scheduler_state_dict(self, constr, constr2, epochs=10): for key in scheduler.__dict__.keys(): if key != 'optimizer': self.assertAlmostEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key]) - self.assertAlmostEqual(scheduler.get_lr(), scheduler_copy.get_lr()) + self.assertAlmostEqual(scheduler.get_last_lr(), scheduler_copy.get_last_lr()) + + def _test_get_last_lr(self, schedulers, targets, epochs=10): + if isinstance(schedulers, _LRScheduler): + schedulers = [schedulers] + for epoch in range(epochs): + result = [scheduler.get_last_lr() for scheduler in schedulers] + [scheduler.step() for scheduler in schedulers] + target = [[t[epoch] for t in targets]] * len(schedulers) + # print(target) + for t, r in zip(target, result): + self.assertAlmostEqual(target, result, + msg='LR is wrong in epoch {}: expected {}, got {}'.format( + epoch, t, r), delta=1e-5) def _test(self, schedulers, targets, epochs=10): if isinstance(schedulers, _LRScheduler): schedulers = [schedulers] for epoch in range(epochs): - [scheduler.step(epoch) for scheduler in schedulers] for param_group, target in zip(self.opt.param_groups, targets): self.assertAlmostEqual(target[epoch], param_group['lr'], msg='LR is wrong in epoch {}: expected {}, got {}'.format( epoch, target[epoch], param_group['lr']), delta=1e-5) + [scheduler.step() for scheduler in schedulers] def _test_CosineAnnealingWarmRestarts(self, scheduler, targets, epochs=10): for index, epoch in enumerate(torch.arange(0, epochs, 0.1)): @@ -1249,15 +1406,16 @@ def _test_interleaved_CosineAnnealingWarmRestarts(self, scheduler, targets, epoc msg='LR is wrong in epoch {}: expected {}, got {}'.format( epoch, target[index], param_group['lr']), delta=1e-5) - def _test_against_legacy(self, scheduler, legacy_scheduler, epochs=10): + def _test_against_closed_form(self, scheduler, closed_form_scheduler, epochs=10): self.setUp() targets = [] + # warnings.filterwarnings("ignore", category=DeprecationWarning) 
for epoch in range(epochs): - legacy_scheduler.step(epoch) + closed_form_scheduler.step(epoch) targets.append([group['lr'] for group in self.opt.param_groups]) self.setUp() for epoch in range(epochs): - scheduler.step(epoch) + scheduler.step() for i, param_group in enumerate(self.opt.param_groups): self.assertAlmostEqual(targets[epoch][i], param_group['lr'], msg='LR is wrong in epoch {}: expected {}, got {}'.format( @@ -1271,7 +1429,7 @@ def _test_reduce_lr_on_plateau(self, schedulers, targets, metrics, epochs=10, ve if isinstance(scheduler, ReduceLROnPlateau): scheduler.step(metrics[epoch]) else: - scheduler.step(epoch) + scheduler.step() if verbose: print('epoch{}:\tlr={}'.format(epoch, self.opt.param_groups[0]['lr'])) for param_group, target in zip(self.opt.param_groups, targets): @@ -1281,7 +1439,6 @@ def _test_reduce_lr_on_plateau(self, schedulers, targets, metrics, epochs=10, ve def _test_cycle_lr(self, scheduler, lr_targets, momentum_targets, batch_iterations, verbose=False, use_beta1=False): for batch_num in range(batch_iterations): - scheduler.step(batch_num) if verbose: if 'momentum' in self.opt.param_groups[0].keys(): print('batch{}:\tlr={},momentum={}'.format(batch_num, self.opt.param_groups[0]['lr'], @@ -1308,6 +1465,7 @@ def _test_cycle_lr(self, scheduler, lr_targets, momentum_targets, batch_iteratio momentum_target[batch_num], param_group['momentum'], msg='Momentum is wrong in batch_num {}: expected {}, got {}'.format( batch_num, momentum_target[batch_num], param_group['momentum']), delta=1e-5) + scheduler.step() def test_cosine_then_cyclic(self): # https://github.com/pytorch/pytorch/issues/21965 diff --git a/torch/optim/lr_scheduler.py b/torch/optim/lr_scheduler.py index 8e75a4b7667bb..64d342ddc7eb0 100644 --- a/torch/optim/lr_scheduler.py +++ b/torch/optim/lr_scheduler.py @@ -4,21 +4,36 @@ from functools import wraps import warnings import weakref +from collections import Counter from bisect import bisect_right from .optimizer import Optimizer +EPOCH_DEPRECATION_WARNING = ( + "The epoch parameter in `scheduler.step()` was not necessary and is being " + "deprecated where possible. Please use `scheduler.step()` to step the " + "scheduler. During the deprecation, if epoch is different from None, the " + "closed form is used instead of the new chainable form, where available. " + "Please open an issue if you are unable to replicate your use case: " + "https://github.com/pytorch/pytorch/issues/new/choose." +) + + class _LRScheduler(object): + def __init__(self, optimizer, last_epoch=-1): + + # Attach optimizer if not isinstance(optimizer, Optimizer): raise TypeError('{} is not an Optimizer'.format( type(optimizer).__name__)) self.optimizer = optimizer + + # Initialize epoch and base learning rates if last_epoch == -1: for group in optimizer.param_groups: group.setdefault('initial_lr', group['lr']) - last_epoch = 0 else: for i, group in enumerate(optimizer.param_groups): if 'initial_lr' not in group: @@ -58,7 +73,8 @@ def wrapper(*args, **kwargs): self.optimizer.step = with_counter(self.optimizer.step) self.optimizer._step_count = 0 self._step_count = 0 - self.step(last_epoch) + + self.step() def state_dict(self): """Returns the state of the scheduler as a :class:`dict`. @@ -77,7 +93,13 @@ def load_state_dict(self, state_dict): """ self.__dict__.update(state_dict) + def get_last_lr(self): + """ Return last computed learning rate by current scheduler. 
+ """ + return self._last_lr + def get_lr(self): + # Compute learning rate using chainable form of the scheduler raise NotImplementedError def step(self, epoch=None): @@ -100,12 +122,36 @@ def step(self, epoch=None): "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate", UserWarning) self._step_count += 1 - if epoch is None: - epoch = self.last_epoch + 1 - self.last_epoch = epoch - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + class _enable_get_lr_call: + + def __init__(self, o): + self.o = o + + def __enter__(self): + self.o._get_lr_called_within_step = True + return self + + def __exit__(self, type, value, traceback): + self.o._get_lr_called_within_step = False + return self + + with _enable_get_lr_call(self): + if epoch is None: + self.last_epoch += 1 + values = self.get_lr() + else: + warnings.warn(EPOCH_DEPRECATION_WARNING, DeprecationWarning) + self.last_epoch = epoch + if hasattr(self, "_get_closed_form_lr"): + values = self._get_closed_form_lr() + else: + values = self.get_lr() + + for param_group, lr in zip(self.optimizer.param_groups, values): param_group['lr'] = lr + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + class LambdaLR(_LRScheduler): """Sets the learning rate of each parameter group to the initial lr @@ -131,6 +177,7 @@ class LambdaLR(_LRScheduler): def __init__(self, optimizer, lr_lambda, last_epoch=-1): self.optimizer = optimizer + if not isinstance(lr_lambda, list) and not isinstance(lr_lambda, tuple): self.lr_lambdas = [lr_lambda] * len(optimizer.param_groups) else: @@ -173,14 +220,19 @@ def load_state_dict(self, state_dict): self.lr_lambdas[idx].__dict__.update(fn) def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.") + return [base_lr * lmbda(self.last_epoch) for lmbda, base_lr in zip(self.lr_lambdas, self.base_lrs)] class StepLR(_LRScheduler): - """Sets the learning rate of each parameter group to the initial lr - decayed by gamma every step_size epochs. When last_epoch=-1, sets - initial lr as lr. + """Decays the learning rate of each parameter group by gamma every + step_size epochs. Notice that such decay can happen simultaneously with + other changes to the learning rate from outside this scheduler. When + last_epoch=-1, sets initial lr as lr. Args: optimizer (Optimizer): Wrapped optimizer. @@ -208,14 +260,25 @@ def __init__(self, optimizer, step_size, gamma=0.1, last_epoch=-1): super(StepLR, self).__init__(optimizer, last_epoch) def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + + if (self.last_epoch == 0) or (self.last_epoch % self.step_size != 0): + return [group['lr'] for group in self.optimizer.param_groups] + return [group['lr'] * self.gamma + for group in self.optimizer.param_groups] + + def _get_closed_form_lr(self): return [base_lr * self.gamma ** (self.last_epoch // self.step_size) for base_lr in self.base_lrs] class MultiStepLR(_LRScheduler): - """Set the learning rate of each parameter group to the initial lr decayed - by gamma once the number of epoch reaches one of the milestones. When - last_epoch=-1, sets initial lr as lr. + """Decays the learning rate of each parameter group by gamma once the + number of epoch reaches one of the milestones. 
Notice that such decay can + happen simultaneously with other changes to the learning rate from outside + this scheduler. When last_epoch=-1, sets initial lr as lr. Args: optimizer (Optimizer): Wrapped optimizer. @@ -237,21 +300,28 @@ class MultiStepLR(_LRScheduler): """ def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1): - if not list(milestones) == sorted(milestones): - raise ValueError('Milestones should be a list of' - ' increasing integers. Got {}', milestones) - self.milestones = milestones + self.milestones = Counter(milestones) self.gamma = gamma super(MultiStepLR, self).__init__(optimizer, last_epoch) def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + + if self.last_epoch not in self.milestones: + return [group['lr'] for group in self.optimizer.param_groups] + return [group['lr'] * self.gamma ** self.milestones[self.last_epoch] + for group in self.optimizer.param_groups] + + def _get_closed_form_lr(self): return [base_lr * self.gamma ** bisect_right(self.milestones, self.last_epoch) for base_lr in self.base_lrs] class ExponentialLR(_LRScheduler): - """Set the learning rate of each parameter group to the initial lr decayed - by gamma every epoch. When last_epoch=-1, sets initial lr as lr. + """Decays the learning rate of each parameter group by gamma every epoch. + When last_epoch=-1, sets initial lr as lr. Args: optimizer (Optimizer): Wrapped optimizer. @@ -264,6 +334,16 @@ def __init__(self, optimizer, gamma, last_epoch=-1): super(ExponentialLR, self).__init__(optimizer, last_epoch) def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + + if self.last_epoch == 0: + return self.base_lrs + return [group['lr'] * self.gamma + for group in self.optimizer.param_groups] + + def _get_closed_form_lr(self): return [base_lr * self.gamma ** self.last_epoch for base_lr in self.base_lrs] @@ -273,12 +353,23 @@ class CosineAnnealingLR(_LRScheduler): schedule, where :math:`\eta_{max}` is set to the initial lr and :math:`T_{cur}` is the number of epochs since the last restart in SGDR: + .. math:: + \eta_{t+1} = \eta_{min} + (\eta_t - \eta_{min})\frac{1 + + \cos(\frac{T_{cur}+1}{T_{max}}\pi)}{1 + \cos(\frac{T_{cur}}{T_{max}}\pi)}, + T_{cur} \neq (2k+1)T_{max};\\ + \eta_{t+1} = \eta_{t} + (\eta_{max} - \eta_{min})\frac{1 - + \cos(\frac{1}{T_{max}}\pi)}{2}, + T_{cur} = (2k+1)T_{max}.\\ + + When last_epoch=-1, sets initial lr as lr. Notice that because the schedule + is defined recursively, the learning rate can be simultaneously modified + outside this scheduler by other operators. If the learning rate is set + solely by this scheduler, the learning rate at each step becomes: + .. math:: \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 + \cos(\frac{T_{cur}}{T_{max}}\pi)) - When last_epoch=-1, sets initial lr as lr. - It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Note that this only implements the cosine annealing part of SGDR, and not the restarts. 
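A minimal usage sketch of the chainable pattern the docstring above describes (editorial illustration, not part of this diff; the tiny model and random batch are placeholders for a real training loop):

```python
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

# Hedged sketch: step() is now called with no epoch argument, and the last
# computed values are read back via get_last_lr(); model/data are stand-ins.
model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
scheduler = CosineAnnealingLR(optimizer, T_max=10)
for epoch in range(10):
    optimizer.zero_grad()
    model(torch.randn(8, 4)).sum().backward()  # stand-in for a real loss
    optimizer.step()
    scheduler.step()  # chainable form: no epoch argument
print(scheduler.get_last_lr())  # last values computed by this scheduler
```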
@@ -299,6 +390,23 @@ def __init__(self, optimizer, T_max, eta_min=0, last_epoch=-1): super(CosineAnnealingLR, self).__init__(optimizer, last_epoch) def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + + if self.last_epoch == 0: + return self.base_lrs + elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: + return [group['lr'] + (base_lr - self.eta_min) * + (1 - math.cos(math.pi / self.T_max)) / 2 + for base_lr, group in + zip(self.base_lrs, self.optimizer.param_groups)] + return [(1 + math.cos(math.pi * self.last_epoch / self.T_max)) / + (1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * + (group['lr'] - self.eta_min) + self.eta_min + for group in self.optimizer.param_groups] + + def _get_closed_form_lr(self): return [self.eta_min + (base_lr - self.eta_min) * (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2 for base_lr in self.base_lrs] @@ -361,6 +469,7 @@ def __init__(self, optimizer, mode='min', factor=0.1, patience=10, raise ValueError('Factor should be < 1.0.') self.factor = factor + # Attach optimizer if not isinstance(optimizer, Optimizer): raise TypeError('{} is not an Optimizer'.format( type(optimizer).__name__)) @@ -385,7 +494,7 @@ def __init__(self, optimizer, mode='min', factor=0.1, patience=10, self.num_bad_epochs = None self.mode_worse = None # the worse value for the chosen mode self.eps = eps - self.last_epoch = -1 + self.last_epoch = 0 self._init_is_better(mode=mode, threshold=threshold, threshold_mode=threshold_mode) self._reset() @@ -400,7 +509,9 @@ def step(self, metrics, epoch=None): # convert `metrics` to float, in case it's a zero-dim Tensor current = float(metrics) if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 + epoch = self.last_epoch + 1 + else: + warnings.warn(EPOCH_DEPRECATION_WARNING, DeprecationWarning) self.last_epoch = epoch if self.is_better(current, self.best): @@ -418,6 +529,8 @@ def step(self, metrics, epoch=None): self.cooldown_counter = self.cooldown self.num_bad_epochs = 0 + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + def _reduce_lr(self, epoch): for i, param_group in enumerate(self.optimizer.param_groups): old_lr = float(param_group['lr']) @@ -579,6 +692,7 @@ def __init__(self, max_momentum=0.9, last_epoch=-1): + # Attach optimizer if not isinstance(optimizer, Optimizer): raise TypeError('{} is not an Optimizer'.format( type(optimizer).__name__)) @@ -658,6 +772,11 @@ def get_lr(self): If `self.cycle_momentum` is ``True``, this function has a side effect of updating the optimizer's momentum. """ + + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + cycle = math.floor(1 + self.last_epoch / self.total_size) x = 1. 
+ self.last_epoch / self.total_size - cycle if x <= self.step_ratio: @@ -725,10 +844,16 @@ def __init__(self, optimizer, T_0, T_mult=1, eta_min=0, last_epoch=-1): self.T_i = T_0 self.T_mult = T_mult self.eta_min = eta_min + super(CosineAnnealingWarmRestarts, self).__init__(optimizer, last_epoch) + self.T_cur = self.last_epoch def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + return [self.eta_min + (base_lr - self.eta_min) * (1 + math.cos(math.pi * self.T_cur / self.T_i)) / 2 for base_lr in self.base_lrs] @@ -757,6 +882,10 @@ def step(self, epoch=None): >>> scheduler.step(26) >>> scheduler.step() # scheduler.step(27), instead of scheduler(20) """ + + if epoch is None and self.last_epoch < 0: + epoch = 0 + if epoch is None: epoch = self.last_epoch + 1 self.T_cur = self.T_cur + 1 @@ -777,8 +906,26 @@ def step(self, epoch=None): self.T_i = self.T_0 self.T_cur = epoch self.last_epoch = math.floor(epoch) - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): - param_group['lr'] = lr + + class _enable_get_lr_call: + + def __init__(self, o): + self.o = o + + def __enter__(self): + self.o._get_lr_called_within_step = True + return self + + def __exit__(self, type, value, traceback): + self.o._get_lr_called_within_step = False + return self + + with _enable_get_lr_call(self): + for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + param_group['lr'] = lr + + self._last_lr = [group['lr'] for group in self.optimizer.param_groups] + class OneCycleLR(_LRScheduler): r"""Sets the learning rate of each parameter group according to the @@ -970,6 +1117,10 @@ def _annealing_linear(self, start, end, pct): return (end - start) * pct + start def get_lr(self): + if not self._get_lr_called_within_step: + warnings.warn("To get the last learning rate computed by the scheduler, " + "please use `get_last_lr()`.", DeprecationWarning) + lrs = [] step_num = self.last_epoch From e166bcbbde807fa78917a40b04681e7d8d9a1f57 Mon Sep 17 00:00:00 2001 From: Shihao Xu Date: Fri, 4 Oct 2019 09:18:29 -0700 Subject: [PATCH 058/113] Make RpcTest re-usable by other RPC backends by using init_method to initialize a RPC backend (#27320) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27320 https://github.com/pytorch/pytorch/pull/27208/ # Problem Other RPC backends take init_method. # Solution Set up init_method in rpc tests. 
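A sketch of the rendezvous URL this produces; the template string is taken verbatim from dist_utils.py in this patch, while the file path, rank, and world size below are illustrative values only:

```python
# Illustrative values only; the template itself comes from this patch.
INIT_METHOD_TEMPLATE = "file://{file_name}?rank={rank}&world_size={world_size}"
init_method = INIT_METHOD_TEMPLATE.format(
    file_name="/tmp/rpc_rendezvous", rank=0, world_size=4)
# -> "file:///tmp/rpc_rendezvous?rank=0&world_size=4"
```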
ghstack-source-id: 91335127 Differential Revision: D17709219 fbshipit-source-id: 3184c6e9b922a6ff9f4d1cb9abfa118b23f43eeb --- test/dist_autograd_test.py | 73 ++++++++---- test/dist_utils.py | 41 ++++--- test/rpc_test.py | 221 +++++++++++++------------------------ 3 files changed, 153 insertions(+), 182 deletions(-) diff --git a/test/dist_autograd_test.py b/test/dist_autograd_test.py index e89a28f4c51c4..c6eb071e07b83 100644 --- a/test/dist_autograd_test.py +++ b/test/dist_autograd_test.py @@ -1,45 +1,64 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import torch.distributed as dist -import torch.distributed.autograd as dist_autograd -from dist_utils import dist_init -import six +import time import unittest + +import six import torch -import time +import torch.distributed as dist +import torch.distributed.autograd as dist_autograd +from dist_utils import INIT_METHOD_TEMPLATE, dist_init + prev_rank_rpc_done = False prev_rank_context_id = 0 + + def _set_rpc_done(context_id): global prev_rank_rpc_done global prev_rank_context_id prev_rank_rpc_done = True prev_rank_context_id = context_id -@unittest.skipIf(not six.PY3, "Pytorch distributed autograd package " - "does not support python2") -class DistAutogradTest(object): +@unittest.skipIf( + not six.PY3, "Pytorch distributed autograd package " "does not support python2" +) +class DistAutogradTest(object): @property def world_size(self): return 4 + @property + def init_method(self): + return INIT_METHOD_TEMPLATE.format( + file_name=self.file_name, rank=self.rank, world_size=self.world_size + ) + @dist_init def test_autograd_context(self): # Verify max possible id. max_auto_increment = 281474976710655 - self.assertEqual(max_auto_increment + (self.worker_id << 48), dist_autograd._get_max_id()) + self.assertEqual( + max_auto_increment + (self.worker_id << 48), dist_autograd._get_max_id() + ) context_ids = [] for i in range(1000): with dist_autograd.context() as context_id: - self.assertEqual(context_id, dist_autograd._retrieve_context(context_id)._context_id()) + self.assertEqual( + context_id, + dist_autograd._retrieve_context(context_id)._context_id(), + ) # First 16 bits should be worker_id. self.assertEqual(self.worker_id, context_id >> 48) context_ids.append(context_id) for context_id in context_ids: - with self.assertRaisesRegex(RuntimeError, 'Could not find autograd context with id: {}'.format(context_id)): + with self.assertRaisesRegex( + RuntimeError, + "Could not find autograd context with id: {}".format(context_id), + ): dist_autograd._retrieve_context(context_id) @dist_init @@ -48,8 +67,10 @@ def test_autograd_functions(self): with dist_autograd.context() as context_id: t1 = torch.ones(3, 3, requires_grad=True) t2 = torch.zeros(3, 3, requires_grad=True) - ret = dist.rpc_sync('worker{}'.format(dst_rank), torch.add, args=(t1, t2)) - dist.rpc_sync('worker{}'.format(dst_rank), _set_rpc_done, args=(context_id,)) + ret = dist.rpc_sync("worker{}".format(dst_rank), torch.add, args=(t1, t2)) + dist.rpc_sync( + "worker{}".format(dst_rank), _set_rpc_done, args=(context_id,) + ) # Get send function. ctx = dist_autograd._current_context() @@ -62,10 +83,10 @@ def test_autograd_functions(self): self.assertEqual(2, len(next_funcs)) # We should now hit t1 and t2 in the autograd graph. 
- self.assertEqual('torch::autograd::AccumulateGrad', next_funcs[0][0].name()) + self.assertEqual("torch::autograd::AccumulateGrad", next_funcs[0][0].name()) self.assertEqual(t1, next_funcs[0][0].variable) self.assertEqual(0, next_funcs[0][1]) - self.assertEqual('torch::autograd::AccumulateGrad', next_funcs[1][0].name()) + self.assertEqual("torch::autograd::AccumulateGrad", next_funcs[1][0].name()) self.assertEqual(t2, next_funcs[1][0].variable) self.assertEqual(0, next_funcs[1][1]) @@ -93,13 +114,17 @@ def test_autograd_functions(self): next_funcs = list(send_functions.values())[0].next_functions self.assertEqual(1, len(next_funcs)) add_backward_fn = next_funcs[0][0] - self.assertEqual('AddBackward0', add_backward_fn.name()) + self.assertEqual("AddBackward0", add_backward_fn.name()) # Verify the next two functions are the same recv backward function. next_funcs = add_backward_fn.next_functions self.assertEqual(2, len(next_funcs)) - self.assertEqual('torch::distributed::autograd::RecvRpcBackward', next_funcs[0][0].name()) - self.assertEqual('torch::distributed::autograd::RecvRpcBackward', next_funcs[1][0].name()) + self.assertEqual( + "torch::distributed::autograd::RecvRpcBackward", next_funcs[0][0].name() + ) + self.assertEqual( + "torch::distributed::autograd::RecvRpcBackward", next_funcs[1][0].name() + ) self.assertEqual(next_funcs[0][0], next_funcs[1][0]) # autograd context should be cleaned up by now. @@ -118,15 +143,21 @@ def test_rpc_complex_args(self): tensors = [] for i in range(num_tensors): tensors.append(torch.ones(3, 3, requires_grad=(i % 2 == 0))) - ret = dist.rpc_sync('worker{}'.format(dst_rank), torch.stack, args=(tensors,)) + ret = dist.rpc_sync( + "worker{}".format(dst_rank), torch.stack, args=(tensors,) + ) self.assertEqual(torch.stack(tensors), ret) # Verify appropriate tensors have been attached the autograd graph. - next_funcs = list(dist_autograd._current_context()._send_functions().values())[0].next_functions + next_funcs = list( + dist_autograd._current_context()._send_functions().values() + )[0].next_functions idx = 0 for i in range(num_tensors): if i % 2 == 0: - self.assertEqual('torch::autograd::AccumulateGrad', next_funcs[i][0].name()) + self.assertEqual( + "torch::autograd::AccumulateGrad", next_funcs[i][0].name() + ) self.assertEqual(tensors[i], next_funcs[i][0].variable) else: self.assertIsNone(next_funcs[i][0]) diff --git a/test/dist_utils.py b/test/dist_utils.py index c37c014f365fa..fcdfdefa22343 100644 --- a/test/dist_utils.py +++ b/test/dist_utils.py @@ -1,36 +1,49 @@ from __future__ import absolute_import, division, print_function, unicode_literals -from os import getenv from functools import wraps +from os import getenv + import torch.distributed as dist from torch.distributed.rpc_api import RpcBackend + if not dist.is_available(): print("c10d not available, skipping tests") sys.exit(0) -BACKEND = getenv('RPC_BACKEND', RpcBackend.PROCESS_GROUP) -RPC_INIT_URL = getenv('RPC_INIT_URL', '') +class TestConfig: + __slots__ = ['backend'] + + def __init__(self, *args, **kwargs): + assert len(args) == 0, "TestConfig only takes kwargs." 
+ for k, v in kwargs.items(): + setattr(self, k, v) -def dist_init(func): + +TEST_CONFIG = TestConfig(backend=getenv("RPC_BACKEND", RpcBackend.PROCESS_GROUP)) +INIT_METHOD_TEMPLATE = "file://{file_name}?rank={rank}&world_size={world_size}" + + +def dist_init(test_method): """ We use this decorator for setting up and tearing down state since MultiProcessTestCase runs each `test*` method in a separate process and each process just runs the `test*` method without actually calling 'setUp' and 'tearDown' methods of unittest. """ - @wraps(func) - def wrapper(self): + + @wraps(test_method) + def wrapper(self, *arg, **kwargs): self.worker_id = self.rank - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group(backend='gloo', rank=self.rank, - world_size=self.world_size, store=store) - dist.init_model_parallel(self_name='worker%d' % self.rank, - backend=BACKEND, - self_rank=self.rank, - init_method=RPC_INIT_URL) - func(self) + dist.init_process_group(backend="gloo", init_method=self.init_method) + dist.init_model_parallel( + self_name="worker%d" % self.rank, + backend=TEST_CONFIG.backend, + self_rank=self.rank, + init_method=self.init_method, + ) + test_method(self, *arg, **kwargs) dist.join_rpc() return wrapper diff --git a/test/rpc_test.py b/test/rpc_test.py index 081e644a2616a..c5ccb4eb94ac0 100644 --- a/test/rpc_test.py +++ b/test/rpc_test.py @@ -4,32 +4,28 @@ import concurrent.futures import sys import unittest -from os import getenv +from collections import namedtuple from unittest import mock import torch import torch.distributed as dist import torch.distributed.rpc_backend_registry as rpc_backend_registry -from collections import namedtuple from common_utils import load_tests -from dist_utils import dist_init +from dist_utils import INIT_METHOD_TEMPLATE, TEST_CONFIG, dist_init from torch.distributed import ProcessGroupAgent -from torch.distributed.internal_rpc_utils import _internal_rpc_pickler, PythonUDF +from torch.distributed.internal_rpc_utils import PythonUDF, _internal_rpc_pickler from torch.distributed.rpc_api import RpcBackend def requires_process_group_agent(func): from torch.distributed.rpc_api import _agent + return unittest.skipUnless( isinstance(_agent, ProcessGroupAgent), "Only ProcessGroupAgent supports global termination detection", ) -BACKEND = getenv("RPC_BACKEND", RpcBackend.PROCESS_GROUP) -RPC_INIT_URL = getenv("RPC_INIT_URL", "") - - VALUE_FUTURE = concurrent.futures.Future() @@ -53,9 +49,8 @@ def __init__(self): def __getstate__(self): (pickled_python_udf, tensors) = _internal_rpc_pickler.serialize( - PythonUDF(my_tensor_function, - (torch.ones(2, 2), torch.ones(2, 2)), - None)) + PythonUDF(my_tensor_function, (torch.ones(2, 2), torch.ones(2, 2)), None) + ) return (pickled_python_udf, tensors) def __setstate__(self, obj): @@ -92,7 +87,7 @@ def build_complex_tensors(): b = [a, a] c = [b, b] d = [a, b] - e = {a : d} + e = {a: d} return [a, b, c, d, e] @@ -125,6 +120,7 @@ def no_result(): def nested_rpc(dst): return dist.rpc_sync(dst, torch.add, args=(torch.ones(2, 2), 1)) + def multi_layer_nested_async_rpc(dst, world_size, ttl): # this method returns immediately without blocking the callee, but will # generate additional requests. 
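The comment above describes a fire-and-forget pattern; a hedged sketch follows (the worker name is illustrative, and RPC initialization via dist_init is assumed):

```python
import torch
import torch.distributed as dist

# rpc_async returns a future immediately; the caller blocks only if it waits.
fut = dist.rpc_async("worker1", torch.add, args=(torch.ones(2, 2), 1))
# ... the caller keeps running; block only when (and if) the result is needed.
result = fut.wait()
```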
@@ -134,18 +130,15 @@ def multi_layer_nested_async_rpc(dst, world_size, ttl): dist.rpc_async( current_dst, multi_layer_nested_async_rpc, - args=( - next_dst, - world_size, - ttl - 1 - ) + args=(next_dst, world_size, ttl - 1), ) return 0 + def nested_rref(dst): return ( dist.remote(dst, torch.add, args=(torch.ones(2, 2), 1)), - dist.remote(dst, torch.add, args=(torch.ones(2, 2), 2)) + dist.remote(dst, torch.add, args=(torch.ones(2, 2), 2)), ) @@ -159,14 +152,7 @@ def rref_forward_chain(dst, world_size, rref, ttl): current_dst = "worker{}".format(dst) next_dst = (dst + 1) % world_size ret_rref = dist.remote( - current_dst, - rref_forward_chain, - args=( - next_dst, - world_size, - rref, - ttl - 1 - ) + current_dst, rref_forward_chain, args=(next_dst, world_size, rref, ttl - 1) ) return [ret_rref] else: @@ -206,6 +192,12 @@ class RpcTest(object): def world_size(self): return 4 + @property + def init_method(self): + return INIT_METHOD_TEMPLATE.format( + file_name=self.file_name, rank=self.rank, world_size=self.world_size + ) + @dist_init def test_worker_id(self): n = self.rank + 1 @@ -227,20 +219,12 @@ def test_self_add(self): with self.assertRaisesRegex( RuntimeError, "does not support making RPC calls to self" ): - dist.rpc_sync( - self_worker_info, - torch.add, - args=(torch.ones(2, 2), 1) - ) + dist.rpc_sync(self_worker_info, torch.add, args=(torch.ones(2, 2), 1)) with self.assertRaisesRegex( RuntimeError, "does not support making RPC calls to self" ): - dist.rpc_sync( - self_worker_name, - torch.add, - args=(torch.ones(2, 2), 1) - ) + dist.rpc_sync(self_worker_name, torch.add, args=(torch.ones(2, 2), 1)) @mock.patch.object(torch.distributed.autograd, "_init") @mock.patch.object(torch.distributed.rpc_api, "_init_rref_context") @@ -254,40 +238,34 @@ def test_register_rpc_backend_and_init_rpc_backend( dist.init_model_parallel(self_name="worker1", backend=backend_name, self_rank=1) @unittest.skipIf( - BACKEND != RpcBackend.PROCESS_GROUP, + TEST_CONFIG.backend != RpcBackend.PROCESS_GROUP, "PROCESS_GROUP rpc backend specific test, skip", ) def test_duplicate_name(self): - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group( - backend="gloo", rank=self.rank, world_size=self.world_size, store=store - ) + dist.init_process_group(backend="gloo", init_method=self.init_method) with self.assertRaisesRegex(RuntimeError, "is not unique"): dist.init_model_parallel( self_name="duplicate_name", - backend=BACKEND, + backend=TEST_CONFIG.backend, self_rank=self.rank, - init_method=RPC_INIT_URL, + init_method=self.init_method, ) dist.join_rpc() def test_reinit(self): - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group( - backend="gloo", rank=self.rank, world_size=self.world_size, store=store - ) + dist.init_process_group(backend="gloo", init_method=self.init_method) dist.init_model_parallel( self_name="worker{}".format(self.rank), - backend=BACKEND, + backend=TEST_CONFIG.backend, self_rank=self.rank, - init_method=RPC_INIT_URL, + init_method=self.init_method, ) with self.assertRaisesRegex(RuntimeError, "is already initialized"): dist.init_model_parallel( self_name="worker{}".format(self.rank), - backend=BACKEND, + backend=TEST_CONFIG.backend, self_rank=self.rank, - init_method=RPC_INIT_URL, + init_method=self.init_method, ) dist.join_rpc() @@ -297,15 +275,12 @@ def test_init_invalid_backend(self): self_name="worker{}".format(self.rank), backend="invalid", self_rank=self.rank, - init_method=RPC_INIT_URL, + init_method=self.init_method, ) 
@unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/25912") def test_invalid_names(self): - store = dist.FileStore(self.file_name, self.world_size) - dist.init_process_group( - backend="gloo", rank=self.rank, world_size=self.world_size, store=store - ) + dist.init_process_group(backend="gloo", init_method=self.init_method) with self.assertRaisesRegex(RuntimeError, "Worker name must match"): dist.init_model_parallel(self_name="abc*") @@ -321,9 +296,9 @@ def test_invalid_names(self): with self.assertRaisesRegex(RuntimeError, "shorter than 128"): dist.init_model_parallel( self_name="".join(["a" for _ in range(500)]), - backend=BACKEND, + backend=TEST_CONFIG.backend, self_rank=self.rank, - init_method=RPC_INIT_URL, + init_method=self.init_method, ) dist.join_rpc() @@ -496,16 +471,8 @@ def test_py_multi_async_call(self): n = self.rank + 1 dst_rank = n % self.world_size dst_worker_info = dist.get_worker_info("worker{}".format(dst_rank)) - fut1 = dist.rpc_async( - dst_worker_info, - MyClass.my_static_method, - args=(n + 10,) - ) - fut2 = dist.rpc_async( - dst_worker_info, - min, - args=(n, n + 1, n + 2) - ) + fut1 = dist.rpc_async(dst_worker_info, MyClass.my_static_method, args=(n + 10,)) + fut2 = dist.rpc_async(dst_worker_info, min, args=(n, n + 1, n + 2)) self.assertEqual(fut1.wait(), MyClass.my_static_method(n + 10)) self.assertEqual(fut2.wait(), min(n, n + 1, n + 2)) @@ -523,11 +490,9 @@ def test_py_tensors(self): ret = dist.rpc_sync( "worker{}".format(dst_rank), my_tensor_function, - args=(torch.ones(n, n), torch.ones(n, n)) + args=(torch.ones(n, n), torch.ones(n, n)), ) - self.assertEqual(ret, - my_tensor_function(torch.ones(n, n), - torch.ones(n, n))) + self.assertEqual(ret, my_tensor_function(torch.ones(n, n), torch.ones(n, n))) @dist_init def test_py_tensors_multi_async_call(self): @@ -538,15 +503,15 @@ def test_py_tensors_multi_async_call(self): fut = dist.rpc_async( "worker{}".format(dst_rank), my_tensor_function, - args=(torch.ones(i, i), torch.ones(i, i)) + args=(torch.ones(i, i), torch.ones(i, i)), ) futs.append(fut) j = 0 for fut in futs: - self.assertEqual(fut.wait(), - my_tensor_function(torch.ones(j, j), - torch.ones(j, j))) + self.assertEqual( + fut.wait(), my_tensor_function(torch.ones(j, j), torch.ones(j, j)) + ) j += 1 @dist_init @@ -557,9 +522,7 @@ def test_py_tensors_in_container(self): b = TensorClass(build_complex_tensors()) c = {"foo": torch.ones(n, n), "bar": torch.ones(n, n)} ret = dist.rpc_sync( - "worker{}".format(dst_rank), - my_complex_tensor_function, - args=(a, b, c) + "worker{}".format(dst_rank), my_complex_tensor_function, args=(a, b, c) ) self.assertEqual(ret, my_complex_tensor_function(a, b, c)) @@ -571,7 +534,7 @@ def test_py_nested_pickle(self): ret = dist.rpc_sync( "worker{}".format(dst_rank), run_nested_pickle, - args=(MyPickleClass(), torch.ones(2, 2)) + args=(MyPickleClass(), torch.ones(2, 2)), ) m = MyPickleClass() @@ -583,11 +546,7 @@ def test_py_function_exception(self): n = self.rank + 1 dst_rank = n % self.world_size with self.assertRaisesRegex(Exception, "TypeError"): - ret = dist.rpc_sync( - "worker{}".format(dst_rank), - no_result, - args=(10,) - ) + ret = dist.rpc_sync("worker{}".format(dst_rank), no_result, args=(10,)) @dist_init def test_py_raise_in_user_func(self): @@ -647,9 +606,7 @@ def test_builtin_remote_ret(self): ) self.assertEqual(rref.to_here(), torch.ones(n, n) * 2) - def _test_multi_remote_call( - self, fn, args_fn=lambda x: (), kwargs_fn=lambda x: {} - ): + def _test_multi_remote_call(self, fn, 
args_fn=lambda x: (), kwargs_fn=lambda x: {}): m = 10 n = self.rank + 1 dst_rank = n % self.world_size @@ -657,12 +614,14 @@ def _test_multi_remote_call( expected = [] for i in range(m): n = n + i - rrefs.append(dist.remote( - 'worker{}'.format(dst_rank), - fn, - args=args_fn(n), - kwargs=kwargs_fn(n) - )) + rrefs.append( + dist.remote( + "worker{}".format(dst_rank), + fn, + args=args_fn(n), + kwargs=kwargs_fn(n), + ) + ) expected.append(fn(*args_fn(n), **kwargs_fn(n))) for i in range(m): @@ -673,6 +632,7 @@ def _test_multi_remote_call( def test_multi_builtin_remote_ret(self): def args_fn(n): return (torch.ones(n, n), torch.ones(n, n)) + self._test_multi_remote_call(torch.add, args_fn=args_fn) @dist_init @@ -691,11 +651,8 @@ def test_py_udf_remote(self): @requires_process_group_agent def test_multi_py_udf_remote(self): def kwargs_fn(n): - return { - "a" : torch.ones(n, n), - "b" : torch.ones(n, n), - "c" : torch.ones(n, n), - } + return {"a": torch.ones(n, n), "b": torch.ones(n, n), "c": torch.ones(n, n)} + self._test_multi_remote_call(my_function, kwargs_fn=kwargs_fn) @dist_init @@ -704,19 +661,13 @@ def test_py_rref_args(self): n = self.rank + 1 dst_rank = n % self.world_size rref_a = dist.remote( - 'worker{}'.format(dst_rank), - torch.add, - args=(torch.ones(n, n), 2) + "worker{}".format(dst_rank), torch.add, args=(torch.ones(n, n), 2) ) rref_b = dist.remote( - 'worker{}'.format(dst_rank), - torch.add, - args=(torch.ones(n, n), 1) + "worker{}".format(dst_rank), torch.add, args=(torch.ones(n, n), 1) ) rref_c = dist.remote( - 'worker{}'.format(dst_rank), - my_rref_function, - args=(rref_a, rref_b) + "worker{}".format(dst_rank), my_rref_function, args=(rref_a, rref_b) ) self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) @@ -727,43 +678,30 @@ def test_py_rref_args_user_share(self): owner_rank = n % self.world_size user_rank = (n + 1) % self.world_size rref_a = dist.remote( - 'worker{}'.format(owner_rank), - my_function, - args=(torch.ones(n, n), 2, 0) + "worker{}".format(owner_rank), my_function, args=(torch.ones(n, n), 2, 0) ) rref_b = dist.remote( - 'worker{}'.format(owner_rank), - my_function, - args=(torch.ones(n, n), 1, 0) + "worker{}".format(owner_rank), my_function, args=(torch.ones(n, n), 1, 0) ) rref_c = dist.remote( - 'worker{}'.format(user_rank), - my_rref_function, - args=(rref_a, rref_b) + "worker{}".format(user_rank), my_rref_function, args=(rref_a, rref_b) ) self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) - @dist_init @requires_process_group_agent def test_py_rpc_rref_args(self): n = self.rank + 1 dst_rank = n % self.world_size rref_a = dist.remote( - "worker{}".format(dst_rank), - my_function, - args=(torch.ones(n, n), 2, 0) + "worker{}".format(dst_rank), my_function, args=(torch.ones(n, n), 2, 0) ) rref_b = dist.remote( - "worker{}".format(dst_rank), - my_function, - args=(torch.ones(n, n), 1, 0) + "worker{}".format(dst_rank), my_function, args=(torch.ones(n, n), 1, 0) ) c = dist.rpc_sync( - "worker{}".format(dst_rank), - my_rref_function, - args=(rref_a, rref_b) + "worker{}".format(dst_rank), my_rref_function, args=(rref_a, rref_b) ) self.assertEqual(c, torch.ones(n, n) + 4) @@ -777,7 +715,7 @@ def test_nested_remote(self): rref = dist.remote( "worker{}".format(dst_rank1), nested_remote, - args=("worker{}".format(dst_rank2),) + args=("worker{}".format(dst_rank2),), ) self.assertEqual(rref.to_here(), torch.ones(2, 2) + 3) @@ -790,7 +728,7 @@ def test_nested_rref(self): rref_of_rrefs = dist.remote( "worker{}".format(dst_rank1), nested_rref, - 
args=("worker{}".format(dst_rank2),) + args=("worker{}".format(dst_rank2),), ) rrefs = rref_of_rrefs.to_here() self.assertEqual(len(rrefs), 2) @@ -809,7 +747,7 @@ def test_nested_rref_stress(self): dist.remote( "worker{}".format(dst_rank1), nested_rref, - args=("worker{}".format(dst_rank2),) + args=("worker{}".format(dst_rank2),), ) ) @@ -838,10 +776,7 @@ def test_multi_layer_nested_async_rpc(self): def test_remote_with_exception(self): n = self.rank + 1 dst_rank = n % self.world_size - rref = dist.remote( - "worker{}".format(dst_rank), - raise_func - ) + rref = dist.remote("worker{}".format(dst_rank), raise_func) with self.assertRaisesRegex(Exception, "ValueError"): rref.to_here() @@ -854,7 +789,7 @@ def test_rpc_return_rref(self): rref = dist.rpc_sync( "worker{}".format(dst_rank1), rpc_return_rref, - args=("worker{}".format(dst_rank2),) + args=("worker{}".format(dst_rank2),), ) self.assertEqual(rref.to_here(), torch.ones(2, 2) + 1) @@ -866,9 +801,7 @@ def test_rref_forward_chain(self): dst_rank = n % self.world_size rref = dist.remote( - "worker{}".format(dst_rank), - torch.add, - args=(torch.ones(n, n), 1) + "worker{}".format(dst_rank), torch.add, args=(torch.ones(n, n), 1) ) ret_rref = rref_forward_chain(dst_rank, self.world_size, rref, ttl) @@ -886,18 +819,12 @@ def test_remote_same_worker(self): n = self.rank + 1 dst_rank = n % self.world_size rref_a = dist.remote( - "worker{}".format(dst_rank), - torch.add, - args=(torch.ones(n, n), 2) + "worker{}".format(dst_rank), torch.add, args=(torch.ones(n, n), 2) ) rref_b = dist.remote( - "worker{}".format(dst_rank), - torch.add, - args=(torch.ones(n, n), 1) + "worker{}".format(dst_rank), torch.add, args=(torch.ones(n, n), 1) ) rref_c = dist.remote( - "worker{}".format(dst_rank), - my_rref_function, - args=(rref_a, rref_b) + "worker{}".format(dst_rank), my_rref_function, args=(rref_a, rref_b) ) self.assertEqual(rref_c.to_here(), torch.ones(n, n) + 4) From 19ab5381c3fa4e2205a53febf94f616143467401 Mon Sep 17 00:00:00 2001 From: Martin Yuan Date: Fri, 4 Oct 2019 09:34:21 -0700 Subject: [PATCH 059/113] Add OPN instruction and vararg operator table (#27104) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27104 * The use case here is to replace prim::ListConstruct, which requires Node, but Node is not available in mobile lite interpreter. * (OPN, X, N), X is the index to the vararg operator-name and operator tables. N is number of inputs. For ListConstruct example, operator name can be "aten::listconstruct" and the overloaded name is the output type ("int", "float", "bool", "tensor" and "generic"). * A vararg operator table is built with void(int input_size, Stack& stack) functions. ## Unit test LiteInterpreterConv covers OPN instruction and conv operator. 
Test Plan: Imported from OSS Differential Revision: D17762853 fbshipit-source-id: 475aa0c6678e3760cec805862a78510913a89c83 --- test/cpp/jit/test_lite_interpreter.cpp | 29 ++++++++- test/cpp/jit/tests.h | 3 +- torch/csrc/jit/export.cpp | 33 +++++++++- torch/csrc/jit/instruction.cpp | 2 +- torch/csrc/jit/instruction.h | 1 + torch/csrc/jit/interpreter.cpp | 18 +++--- torch/csrc/jit/interpreter.h | 2 +- torch/csrc/jit/mobile/function.cpp | 61 +++++++++++++++++-- torch/csrc/jit/mobile/function.h | 8 ++- torch/csrc/jit/mobile/import.cpp | 21 ++++--- torch/csrc/jit/mobile/interpreter.cpp | 39 +++++++----- torch/csrc/jit/mobile/interpreter.h | 3 +- torch/csrc/jit/mobile/register_mobile_ops.cpp | 36 +++++++++++ 13 files changed, 212 insertions(+), 44 deletions(-) diff --git a/test/cpp/jit/test_lite_interpreter.cpp b/test/cpp/jit/test_lite_interpreter.cpp index 9f15573cde048..30bce094ca2b8 100644 --- a/test/cpp/jit/test_lite_interpreter.cpp +++ b/test/cpp/jit/test_lite_interpreter.cpp @@ -9,7 +9,7 @@ namespace torch { namespace jit { -void testLiteInterpreter() { +void testLiteInterpreterAdd() { script::Module m("m"); m.register_parameter("foo", torch::ones({}), false); // TODO: support default param val, which was pushed in @@ -43,5 +43,32 @@ void testLiteInterpreter() { AT_ASSERT(resd == refd); } +void testLiteInterpreterConv() { + std::vector inputs; + + script::Module m("m"); + m.register_parameter("weight", torch::ones({20, 1, 5, 5}), false); + m.register_parameter("bias", torch::ones({20}), false); + m.define(R"( + def forward(self, input): + return torch._convolution(input, self.weight, self.bias, [1, 1], [0, 0], [1, 1], False, [0, 0], 1, False, False, True) + )"); + + inputs.push_back(torch::ones({1, 1, 28, 28})); + + auto outputref = m.forward(inputs).toTensor(); + + std::stringstream ss; + m._save_for_mobile(ss); + mobile::Module bc = _load_for_mobile(ss); + IValue res; + for (int i = 0; i < 3; ++i) { + auto bcinputs = inputs; + res = bc.run_method("forward", bcinputs); + } + auto output = res.toTensor(); + AT_ASSERT(outputref.dim() == output.dim()); + AT_ASSERT(outputref[0][0][0][0].item() == output[0][0][0][0].item()); +} } // namespace torch } // namespace jit diff --git a/test/cpp/jit/tests.h b/test/cpp/jit/tests.h index 174e8bb115b93..5efb371a6b44e 100644 --- a/test/cpp/jit/tests.h +++ b/test/cpp/jit/tests.h @@ -65,7 +65,8 @@ namespace jit { _(ImportTooNew) \ _(ClassDerive) \ _(Inliner) \ - _(LiteInterpreter) + _(LiteInterpreterAdd) \ + _(LiteInterpreterConv) #define TH_FORALL_TESTS_CUDA(_) \ _(ArgumentSpec) \ diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index bd2d1151ec872..1757c42b06a41 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -668,12 +668,41 @@ class ScriptModuleSerializer { for (const auto& method : methods) { const auto& func = method.function(); torch::jit::Code code(func.graph()); + // Make a copy of opnames. Some of them may be changed for mobile later. 
+ std::vector opnames; + for (size_t i = 0; i < code.instructions().size(); ++i) { + Instruction ins = code.instructions()[i]; + if (ins.op == OP) { + auto node = code.instructions_source()[i]; + opnames.emplace_back(node->schema().operator_name()); + } + } // instructions std::vector inss; - for (const auto& ins : code.instructions()) { + for (size_t i = 0; i < code.instructions().size(); ++i) { + Instruction ins = code.instructions()[i]; TORCH_CHECK(isOpSupportedInMobile(ins.op), toString(ins.op), " is not supported in mobile module."); + if (ins.op == OP) { + if (opnames[ins.X].name == "prim::ListConstruct") { + auto node = code.instructions_source()[i]; + ins.op = OPN; + ins.N = node->inputs().size(); + ListTypePtr lt = node->output()->type()->expect(); + if (lt->getElementType() == IntType::get()) { + opnames[ins.X].overload_name = "int"; + } else if (lt->getElementType() == FloatType::get()) { + opnames[ins.X].overload_name = "float"; + } else if (lt->getElementType() == BoolType::get()) { + opnames[ins.X].overload_name = "bool"; + } else if (lt->getElementType()->isSubtypeOf(TensorType::get())) { + opnames[ins.X].overload_name = "Tensor"; + } else { + opnames[ins.X].overload_name = "generic"; + } + } + } std::vector insv{toString(ins.op), ins.X, ins.N}; inss.emplace_back(c10::ivalue::Tuple::create(std::move(insv))); } @@ -682,7 +711,7 @@ class ScriptModuleSerializer { // operators std::vector opss; - for (const auto& opname : code.opname_table()) { + for (const auto& opname : opnames) { opss.emplace_back(c10::ivalue::Tuple::create({opname.name, opname.overload_name})); } auto operators = c10::ivalue::Tuple::create(std::move(opss)); diff --git a/torch/csrc/jit/instruction.cpp b/torch/csrc/jit/instruction.cpp index e3134d96b66d5..4dc0c5e959097 100644 --- a/torch/csrc/jit/instruction.cpp +++ b/torch/csrc/jit/instruction.cpp @@ -70,7 +70,7 @@ OpCode parseOpCode(const char *str) { bool isOpSupportedInMobile(OpCode op) { static constexpr OpCode supported_ops_in_mobile[] { - OP, LOAD, MOVE, STOREN, STORE, DROP, DROPR, LOADC, JF, LOOP, RET, GET_ATTR, SET_ATTR + OP, OPN, LOAD, MOVE, STOREN, STORE, DROP, DROPR, LOADC, JF, LOOP, RET, GET_ATTR, SET_ATTR }; for (auto sop : supported_ops_in_mobile) { diff --git a/torch/csrc/jit/instruction.h b/torch/csrc/jit/instruction.h index e16c434a323c9..0bc49b4bed7ab 100644 --- a/torch/csrc/jit/instruction.h +++ b/torch/csrc/jit/instruction.h @@ -19,6 +19,7 @@ namespace jit { #define FORALL_OPCODES(_) \ _(OP, "O") /* invoke operator X */ \ + _(OPN, "OI") /* invoke vararg operator X with N arguments */ \ _(LOAD, "R") /* push a value from a register X */ \ _(MOVE, "R") /* push a value from register X, clearing the register */ \ _(STOREN, "RI") /* store N values to registers [X, X+N) */ \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index 5bee911846d27..ba8cde05e6ec9 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -339,8 +339,6 @@ struct CodeImpl { std::vector constant_table_; std::vector operator_table_; - // opname_table_ has the same order as operator_table_. 
-  std::vector<c10::OperatorName> opname_table_;
   std::vector<Function*> function_table_;
   std::vector<TypePtr> type_table_;
   int register_size_ = 0;
@@ -400,8 +398,8 @@ struct CodeImpl {
     return instructions_;
   }
 
-  const std::vector<c10::OperatorName>& opname_table() const {
-    return opname_table_;
+  const std::vector<Node*>& instructions_source() const {
+    return instructions_source_;
   }
 
   void insertInstruction(OpCode op, int64_t X = 0, uint64_t N = 0) {
@@ -491,7 +489,6 @@ struct CodeImpl {
     emitLoadInputs(node->inputs());
     insertInstruction(OP, operator_table_.size());
     operator_table_.emplace_back(getOperation(node));
-    opname_table_.emplace_back(node->schema().operator_name());
   }
 
   void emitWait(Node* node) {
@@ -848,14 +845,17 @@ struct InterpreterStateImpl : c10::intrusive_ptr_target {
     ActiveFrame af(frames.back());
     try {
       while (true) {
-        // std::cout << "RUNNING ";
-        // frames.back().function->dump(std::cout, af.pc);
+//         std::cout << "RUNNING ";
+//         frames.back().function->dump(std::cout, af.pc);
         Instruction inst = af.instructions[af.pc];
         switch (inst.op) {
           case OP:
             af.operators[inst.X](stack);
             ++af.pc;
             break;
+          case OPN:
+            AT_ERROR("OPN is currently supported in mobile mode only.");
+            break;
           case LOAD:
             stack.emplace_back(reg(inst.X));
             ++af.pc;
@@ -1132,8 +1132,8 @@ const std::vector<Instruction>& Code::instructions() const {
   return pImpl->instructions();
 }
 
-const std::vector<c10::OperatorName>& Code::opname_table() const {
-  return pImpl->opname_table();
+const std::vector<Node*>& Code::instructions_source() const {
+  return pImpl->instructions_source();
 }
 
 size_t Code::register_size() const {
diff --git a/torch/csrc/jit/interpreter.h b/torch/csrc/jit/interpreter.h
index cd683939091a5..f11c2d544967c 100644
--- a/torch/csrc/jit/interpreter.h
+++ b/torch/csrc/jit/interpreter.h
@@ -45,7 +45,7 @@ struct TORCH_API Code {
   size_t num_outputs() const;
   const std::vector<IValue>& constant_table() const;
   const std::vector<Instruction>& instructions() const;
-  const std::vector<c10::OperatorName>& opname_table() const;
+  const std::vector<Node*>& instructions_source() const;
   size_t register_size() const;
 
  private:
diff --git a/torch/csrc/jit/mobile/function.cpp b/torch/csrc/jit/mobile/function.cpp
index bc0efa60303f4..5405b87f0f70e 100644
--- a/torch/csrc/jit/mobile/function.cpp
+++ b/torch/csrc/jit/mobile/function.cpp
@@ -1,28 +1,81 @@
 #include "function.h"
 #include "interpreter.h"
+#include
+#include
 
 namespace torch{
 namespace jit{
+
+namespace {
+template <typename T> // int64_t, bool, double
+void listConstruct(int num_inputs, Stack& stack) {
+  auto inputs = peekSlice(stack, 0, num_inputs, num_inputs);
+  c10::List<T> vals =
+      c10::impl::toList(fmap(inputs, [](const IValue& v) { return v.to<T>(); }));
+  drop(stack, num_inputs);
+  push(stack, std::move(vals));
+}
+
+void tensorListConstruct(int num_inputs, Stack& stack) {
+  const size_t stack_size = stack.size();
+  c10::List<at::Tensor> vals;
+  vals.reserve(num_inputs);
+  for (size_t i = stack_size - num_inputs; i < stack_size; ++i) {
+    vals.emplace_back(std::move(stack[i]).toTensor());
+  }
+  drop(stack, num_inputs);
+  push(stack, std::move(vals));
+}
+}
+
+char const * toString(OpCode op);
 namespace mobile {
 Function::Function(c10::QualifiedName name)
     : name_(name), code_(std::make_shared<Code>()) {}
 
-void Function::append_instruction(OpCode op, int N, int X) {
-  code_->instructions_.emplace_back(op, N, X);
+void Function::append_instruction(OpCode op, int X, int N) {
+  TORCH_CHECK(isOpSupportedInMobile(op), toString(op),
+      " is not supported in mobile module.");
+  code_->instructions_.emplace_back(op, X, N);
 }
 
 void Function::append_operator(const std::string& name,
-                               const std::string&
overload_name) { + // Keep the original opname in code_ code_->op_names_.emplace_back(name, overload_name); auto opname = code_->op_names_.back(); // Add "_" prefix to work around the double registration both of jit/generated // and here. TODO: remove it when we have separate build for lite interpreter. opname.name = "_" + opname.name; auto op = c10::Dispatcher::singleton().findSchema(opname); - assert(op.has_value()); + TORCH_CHECK(op.has_value(), opname.name, ".", opname.overload_name, " cannot be found."); code_->operators_.emplace_back(op); } +void Function::build_vararg_operator_table() { + for (auto& ins : code_->instructions_) { + if (ins.op == OPN) { + auto opname = code_->op_names_[ins.X]; + if (opname.name == "prim::ListConstruct") { + if (opname.overload_name == "int") { + code_->vararg_operators_.emplace_back(listConstruct); + } else if (opname.overload_name == "float") { + code_->vararg_operators_.emplace_back(listConstruct); + } else if (opname.overload_name == "bool") { + code_->vararg_operators_.emplace_back(listConstruct); + } else if (opname.overload_name == "Tensor") { + code_->vararg_operators_.emplace_back(tensorListConstruct); + } else { + AT_ERROR("Type of ListConstruct is not supported."); + } + } else { + AT_ERROR("OPN operator ", opname.name, " is not supported."); + } + ins.X = code_->vararg_operators_.size() - 1; + } + } +} + void Function::append_constant(const c10::IValue& constant) { code_->constants_.push_back(constant); } diff --git a/torch/csrc/jit/mobile/function.h b/torch/csrc/jit/mobile/function.h index e79ce867f3e0e..28de4b453c801 100644 --- a/torch/csrc/jit/mobile/function.h +++ b/torch/csrc/jit/mobile/function.h @@ -17,8 +17,12 @@ class Function{ bool run(Stack& stack) const; const std::string& name() const; const c10::QualifiedName& qualname() const; - void append_instruction(OpCode op, int N, int X); - void append_operator(const std::string& name, const std::string& overload_name); + void append_instruction(OpCode op, int X, int N); + void append_operator(const std::string& name, + const std::string& overload_name); + void append_vararg_operator(const std::string& name, + const std::string& overload_name); + void build_vararg_operator_table(); void append_constant(const c10::IValue& constant); void set_register_size(size_t size); diff --git a/torch/csrc/jit/mobile/import.cpp b/torch/csrc/jit/mobile/import.cpp index cfce8e7d4baba..72b484e3a5ffa 100644 --- a/torch/csrc/jit/mobile/import.cpp +++ b/torch/csrc/jit/mobile/import.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include @@ -54,20 +55,23 @@ void parseMethods(const std::vector& vals, std::shared_ptrelements(); + + auto named_ops = comps[1].toTuple()->elements(); + auto ops_name = named_ops[0].toString()->string(); + TORCH_CHECK(ops_name == "operators", + "operator is expected, but get", ops_name); + auto ops_list = named_ops[1].toTuple()->elements(); + for (const auto& ins : ins_list) { auto ins_item = ins.toTuple()->elements(); TORCH_CHECK(ins_item.size() == 3, "There should be three parts in an instruction."); OpCode op_code = parseOpCode(ins_item[0].toString()->string().c_str()); - function->append_instruction(op_code, ins_item[1].toInt(), - ins_item[2].toInt()); + int X = ins_item[1].toInt(); + int N = ins_item[2].toInt(); + function->append_instruction(op_code, X, N); } - auto named_ops = comps[1].toTuple()->elements(); - auto ops_name = named_ops[0].toString()->string(); - TORCH_CHECK(ops_name == "operators", - "operator is expected, but get", ops_name); - auto ops_list = 
named_ops[1].toTuple()->elements(); for (const auto& op : ops_list) { auto op_item = op.toTuple()->elements(); TORCH_CHECK(op_item.size() == 2, @@ -76,6 +80,9 @@ void parseMethods(const std::vector& vals, std::shared_ptrstring()); } + // vararg operators are stored in a separate table. + function->build_vararg_operator_table(); + auto named_consts = comps[2].toTuple()->elements(); auto consts_name = named_consts[0].toString()->string(); TORCH_CHECK(consts_name == "constants", diff --git a/torch/csrc/jit/mobile/interpreter.cpp b/torch/csrc/jit/mobile/interpreter.cpp index bad5c4b62fa91..1cdd33fd0b356 100644 --- a/torch/csrc/jit/mobile/interpreter.cpp +++ b/torch/csrc/jit/mobile/interpreter.cpp @@ -5,36 +5,45 @@ namespace torch{ namespace jit{ char const * toString(OpCode op); +std::ostream& operator<<(std::ostream& out, Instruction inst); namespace mobile { InterpreterState::InterpreterState(std::shared_ptr code) : code_(code) { registers_.resize(code_->register_size_); } -//InterpreterState::InterpreterState(Function* function) -// : function_(function) { -// registers_.resize(function->register_size()); -//} +namespace { +template // int64_t, bool, double +void listConstruct(Stack& stack, int num_inputs) { + auto inputs = peekSlice(stack, 0, num_inputs, num_inputs); + c10::List vals = + c10::impl::toList(fmap(inputs, [](const IValue& v) { return v.to(); })); + drop(stack, num_inputs); + push(stack, std::move(vals)); +} +} bool InterpreterState::run(Stack& stack) { size_t pc = 0; while (true) { - // std::cout << "RUNNING " << pc << " " << instructions_[pc]; - // std::cout << std::endl; - // for (auto val : stack) { - // if (val.isTensor()) { - // std::cout << val.toTensor().sizes() << std::endl; - // } else { - // std::cout << val << std::endl; - // } - // } +// std::cout << "RUNNING " << pc << " " << code_->instructions_[pc]; +// std::cout << std::endl; +// for (auto val : stack) { +// if (val.isTensor()) { +// std::cout << val.toTensor().sizes() << std::endl; +// } else { +// std::cout << val << std::endl; +// } +// } Instruction inst = code_->instructions_[pc]; - TORCH_CHECK(isOpSupportedInMobile(inst.op), toString(inst.op), - " is not supported in mobile module."); switch (inst.op) { case OP: { c10::Dispatcher::singleton().callBoxed(*code_->operators_[inst.X], &stack); ++pc; } break; + case OPN: { + code_->vararg_operators_[inst.X](inst.N, stack); + ++pc; + } break; case LOAD: stack.emplace_back(reg(inst.X)); ++pc; diff --git a/torch/csrc/jit/mobile/interpreter.h b/torch/csrc/jit/mobile/interpreter.h index e976ca2d11454..5017e59dc0fa7 100644 --- a/torch/csrc/jit/mobile/interpreter.h +++ b/torch/csrc/jit/mobile/interpreter.h @@ -8,11 +8,12 @@ namespace torch{ namespace jit{ namespace mobile { using Stack = std::vector; - +using VarargFuncton = std::function; struct Code { std::vector instructions_; std::vector op_names_; std::vector> operators_; + std::vector vararg_operators_; std::vector constants_; size_t register_size_; // Aggregated output size. 
}; diff --git a/torch/csrc/jit/mobile/register_mobile_ops.cpp b/torch/csrc/jit/mobile/register_mobile_ops.cpp index 917b07b735e81..5d26a130ebfaf 100644 --- a/torch/csrc/jit/mobile/register_mobile_ops.cpp +++ b/torch/csrc/jit/mobile/register_mobile_ops.cpp @@ -23,5 +23,41 @@ static auto registry0 = torch::RegisterOperators().op( [](at::Tensor a, at::Scalar b, at::Scalar c) ->at::Tensor { return at::add(a, b, c); }) +).op( + "_aten::_convolution", + torch::RegisterOperators::options().kernel(c10::TensorTypeId::CPUTensorId, + [](at::Tensor input, at::Tensor weight, c10::optional bias, + std::vector stride, std::vector padding, + std::vector dilation, bool transposed, std::vector output_padding, + int64_t groups, bool benchmark, bool deterministic, bool cudnn_enabled) { + return at::_convolution(input, weight, optional_to_tensor(bias), stride, padding, dilation, + transposed, output_padding, groups, benchmark, deterministic, cudnn_enabled); + }) +).op( + // Dummy operator that does nothing. Used to reserve a location of an operator table. + "_prim::ListConstruct.int", + torch::RegisterOperators::options().catchAllKernel( + []() { + }) +).op( + "_prim::ListConstruct.float", + torch::RegisterOperators::options().catchAllKernel( + []() { + }) +).op( + "_prim::ListConstruct.bool", + torch::RegisterOperators::options().catchAllKernel( + []() { + }) +).op( + "_prim::ListConstruct.tensor", + torch::RegisterOperators::options().catchAllKernel( + []() { + }) +).op( + "_prim::ListConstruct.generic", + torch::RegisterOperators::options().catchAllKernel( + []() { + }) ); From 50b3f9d815efceac58f41041dee4b355fb33b92c Mon Sep 17 00:00:00 2001 From: Igor Fedan Date: Fri, 4 Oct 2019 10:01:50 -0700 Subject: [PATCH 060/113] Allow use cpu_serial_kernel with void-lambda (#27370) Summary: https://github.com/pytorch/pytorch/pull/27271 Pull Request resolved: https://github.com/pytorch/pytorch/pull/27370 Differential Revision: D17763265 Pulled By: ifedan fbshipit-source-id: d670560dfc555db529b18c01aa42f0ccb2127889 --- aten/src/ATen/native/cpu/IsContiguous.h | 40 ++++++++++++++---- aten/src/ATen/native/cpu/Loops.h | 41 ++++++++++++++----- aten/src/ATen/test/tensor_iterator_test.cpp | 45 ++++++++++++++++++++- 3 files changed, 108 insertions(+), 18 deletions(-) diff --git a/aten/src/ATen/native/cpu/IsContiguous.h b/aten/src/ATen/native/cpu/IsContiguous.h index 6392746a8013a..8a24ce0b7f24c 100644 --- a/aten/src/ATen/native/cpu/IsContiguous.h +++ b/aten/src/ATen/native/cpu/IsContiguous.h @@ -5,34 +5,58 @@ namespace at { namespace native { namespace { // n: number of function arguments (arity) // traits: function_traits (see FunctionTraits.h) // s: index of scalar argument or -1 -template +template struct IsContiguous { static bool eval(const int64_t* strides) { using type = typename traits::template arg::type; - return strides[n] == (s == n ? 0 : sizeof(type)) && - IsContiguous::eval(strides); + return strides[stride_index] == (s == n ? 
0 : sizeof(type)) &&
+        IsContiguous<n - 1, stride_index - 1, traits, s>::eval(strides);
   }
 };
 
+// will be called when there is an output exists
 template <typename traits, int s>
-struct IsContiguous<0, traits, s> {
+struct IsContiguous<0, 0, traits, s> {
   static bool eval(const int64_t* strides) {
     return strides[0] == sizeof(typename traits::result_type);
   }
 };
 
+// will be called when there is no output
+template <typename traits, int s>
+struct IsContiguous<0, -1, traits, s> {
+  static bool eval(const int64_t* strides) {
+    return true;
+  }
+};
+
 // output and all inputs are contiguous
-template <typename traits>
+template <typename traits,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
 static inline bool is_contiguous(const int64_t* strides) {
-  return IsContiguous<traits::arity, traits>::eval(strides);
+  return IsContiguous<traits::arity, traits::arity, traits>::eval(strides);
+}
+
+template <typename traits,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous(const int64_t* strides) {
+  return IsContiguous<traits::arity, traits::arity - 1, traits>::eval(strides);
 }
 
 // input at `s` is scalar (stride 0); output and other inputs are contiguous
 // NB: output is typically at strides[0] so first input corresponds to s=1
-template <typename traits, int s>
+template <typename traits, int s,
+    typename std::enable_if<std::is_void<typename traits::result_type>::value>::type* = nullptr>
+static inline bool is_contiguous_scalar(const int64_t* strides) {
+  static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
+  return IsContiguous<traits::arity, traits::arity - 1, traits, s>::eval(strides);
+}
+
+template <typename traits, int s,
+    typename std::enable_if<!std::is_void<typename traits::result_type>::value>::type* = nullptr>
 static inline bool is_contiguous_scalar(const int64_t* strides) {
   static_assert(s > 0 && s <= traits::arity, "scalar argument index out of bounds");
-  return IsContiguous<traits::arity, traits, s>::eval(strides);
+  return IsContiguous<traits::arity, traits::arity, traits, s>::eval(strides);
 }
 
 }}}
diff --git a/aten/src/ATen/native/cpu/Loops.h b/aten/src/ATen/native/cpu/Loops.h
index e4bc9bc492b49..5d2ae2020c9c0 100644
--- a/aten/src/ATen/native/cpu/Loops.h
+++ b/aten/src/ATen/native/cpu/Loops.h
@@ -80,13 +80,40 @@ dereference_vec(char* C10_RESTRICT data[], const typename traits::result_type& o
   return dereference_vec_impl(data, opt_scalar, S, i, Indices{});
 }
 
+template <typename func_t,
+    typename std::enable_if<!std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
+static inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) {
+  using traits = function_traits<func_t>;
+  using result_type = typename traits::result_type;
+  for (; i < n; i++) {
+    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
+    *out_ptr = c10::guts::apply(op, dereference<traits>(
+        &data[1],
+        &strides[1],
+        i));
+  }
+}
+
+template <typename func_t,
+    typename std::enable_if<std::is_void<typename function_traits<func_t>::result_type>::value>::type* = nullptr>
+static inline void
+execute_op(char* C10_RESTRICT data[], const int64_t* strides, int64_t i, int64_t n, func_t op) {
+  using traits = function_traits<func_t>;
+  for (; i < n; i++) {
+    c10::guts::apply(op, dereference<traits>(
+        &data[0],
+        &strides[0],
+        i));
+  }
+}
+
 // Basic loop operation (one output, N inputs). May be auto-vectorized
 // by the compiler. Supports inputs and outputs of different types.
 template <typename func_t>
 static inline void
 basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_t n, func_t op) {
   using traits = function_traits<func_t>;
-  using result_type = typename traits::result_type;
   constexpr int ntensors = traits::arity + 1;
 
   // Copying strides to temporary array helps auto vectorization in older GCC
@@ -96,13 +123,7 @@ basic_loop(char* C10_RESTRICT data[], const int64_t* strides_, int64_t i, int64_
     strides[arg] = strides_[arg];
   }
 
-  for (; i < n; i++) {
-    result_type* out_ptr = (result_type*)(data[0] + i * strides[0]);
-    *out_ptr = c10::guts::apply(op, dereference<traits>(
-        &data[1],
-        &strides[1],
-        i));
-  }
+  execute_op(data, strides, i, n, op);
 }
 
 // Explicitly vectorized loop implementation.
All inputs and outputs must be @@ -205,7 +226,8 @@ void cpu_kernel_vec(TensorIterator& iter, func_t op, vec_func_t vop) { template void cpu_serial_kernel(TensorIterator& iter, func_t op) { using traits = function_traits; - TORCH_INTERNAL_ASSERT(iter.ntensors() >= traits::arity + 1); + TORCH_INTERNAL_ASSERT((std::is_void::value && + iter.noutputs() == 0 && iter.ntensors() == traits::arity) || (iter.ntensors() >= traits::arity + 1)); iter.serial_for_each([&](char** data, const int64_t* strides, int64_t n) { if (is_contiguous(strides)) { @@ -217,6 +239,7 @@ void cpu_serial_kernel(TensorIterator& iter, func_t op) { }); } }, {0, iter.numel()}); + iter.cast_outputs(); } }}} // namespace at::native:: diff --git a/aten/src/ATen/test/tensor_iterator_test.cpp b/aten/src/ATen/test/tensor_iterator_test.cpp index ef4d17a2f8b3f..6939d32ac7686 100644 --- a/aten/src/ATen/test/tensor_iterator_test.cpp +++ b/aten/src/ATen/test/tensor_iterator_test.cpp @@ -61,17 +61,41 @@ TEST(TensorIteratorTest, SerialLoopUnary_##name) { \ ASSERT_ANY_THROW(out.equal(expected)); \ } +#define NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE(ctype,name) \ +TEST(TensorIteratorTest, SerialLoopUnaryNoOutput_##name) { \ + auto in = random_tensor_for_type(k##name); \ + auto iter = at::TensorIterator(); \ + iter.add_input(in); \ + iter.build(); \ + int64_t acc = 0; \ + at::native::cpu_serial_kernel(iter, [&](ctype a) -> void { acc++; }); \ + EXPECT_TRUE(acc == in.numel()); \ +} + #define BINARY_TEST_ITER_FOR_TYPE(ctype,name) \ TEST(TensorIteratorTest, SerialLoopBinary_##name) { \ Tensor out; \ auto in1 = random_tensor_for_type(k##name); \ auto in2 = random_tensor_for_type(k##name); \ auto expected = in1.add(in2); \ - auto iter = TensorIterator::binary_op(out, in1, in2); \ + auto iter = TensorIterator::binary_op(out, in1, in2); \ at::native::cpu_serial_kernel(iter, [=](ctype a, ctype b) -> int { return a + b; }); \ ASSERT_ANY_THROW(out.equal(expected)); \ } +#define NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE(ctype,name) \ +TEST(TensorIteratorTest, SerialLoopBinaryNoOutput_##name) { \ + auto in1 = random_tensor_for_type(k##name); \ + auto in2 = random_tensor_for_type(k##name); \ + auto iter = at::TensorIterator(); \ + iter.add_input(in1); \ + iter.add_input(in2); \ + iter.build(); \ + int64_t acc = 0; \ + at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b) -> void { acc++; }); \ + EXPECT_TRUE(acc == in1.numel()); \ +} + #define POINTWISE_TEST_ITER_FOR_TYPE(ctype,name) \ TEST(TensorIteratorTest, SerialLoopPointwise_##name) { \ Tensor out; \ @@ -89,6 +113,21 @@ TEST(TensorIteratorTest, SerialLoopPointwise_##name) { ASSERT_ANY_THROW(out.equal(expected)); \ } +#define NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE(ctype,name) \ +TEST(TensorIteratorTest, SerialLoopPoinwiseNoOutput_##name) { \ + auto in1 = random_tensor_for_type(k##name); \ + auto in2 = random_tensor_for_type(k##name); \ + auto in3 = random_tensor_for_type(k##name); \ + auto iter = at::TensorIterator(); \ + iter.add_input(in1); \ + iter.add_input(in2); \ + iter.add_input(in3); \ + iter.build(); \ + int64_t acc = 0; \ + at::native::cpu_serial_kernel(iter, [&](ctype a, ctype b, ctype c) -> void { acc++; }); \ + EXPECT_TRUE(acc == in1.numel()); \ +} + // The alternative way to calculate a < b is (b - a).clamp(0).toBool() // To prevent an overflow in subtraction (b - a) for unsigned types(unit, bool) // we will convert in to int first @@ -112,6 +151,9 @@ TEST(TensorIteratorTest, ComparisonLoopBinary_##name) { AT_FORALL_SCALAR_TYPES(UNARY_TEST_ITER_FOR_TYPE) 
AT_FORALL_SCALAR_TYPES(BINARY_TEST_ITER_FOR_TYPE)
 AT_FORALL_SCALAR_TYPES(POINTWISE_TEST_ITER_FOR_TYPE)
+AT_FORALL_SCALAR_TYPES(NO_OUTPUT_UNARY_TEST_ITER_FOR_TYPE)
+AT_FORALL_SCALAR_TYPES(NO_OUTPUT_BINARY_TEST_ITER_FOR_TYPE)
+AT_FORALL_SCALAR_TYPES(NO_OUTPUT_POINTWISE_TEST_ITER_FOR_TYPE)
 AT_FORALL_SCALAR_TYPES_AND(Bool, COMPARISON_TEST_ITER_FOR_TYPE)
 
 TEST(TensorIteratorTest, SerialLoopSingleThread) {
@@ -172,3 +214,4 @@ TEST(TensorIteratorTest, DoNotComputeCommonDTypeIfOutputIsUndefined) {
   iter.compute_common_dtype_only_for_inputs();
   ASSERT_ANY_THROW(iter.build());
 }
+

From 9f9c6c09992a7654640b44461911277d784803e4 Mon Sep 17 00:00:00 2001
From: Pim de Haan
Date: Fri, 4 Oct 2019 10:54:30 -0700
Subject: [PATCH 061/113] From docs of scatter_add_() removed erroneous comment
 on uniqueness of indices. (#27132)

Summary:
Fixes https://github.com/pytorch/pytorch/issues/27080
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27132

Differential Revision: D17765307

Pulled By: soumith

fbshipit-source-id: b0892ff442f3b49f8e3cdf029e2a08b51fa88f28
---
 torch/_tensor_docs.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/torch/_tensor_docs.py b/torch/_tensor_docs.py
index 9d50a4112a049..e58eb37725b3c 100644
--- a/torch/_tensor_docs.py
+++ b/torch/_tensor_docs.py
@@ -2253,10 +2253,6 @@ def callable(a, b) -> number
 dimensions ``d``, and that ``index.size(d) <= self.size(d)`` for all
 dimensions ``d != dim``.
 
-Moreover, as for :meth:`~Tensor.gather`, the values of :attr:`index` must be
-between ``0`` and ``self.size(dim) - 1`` inclusive, and all values in a row along
-the specified dimension :attr:`dim` must be unique.
-
 .. include:: cuda_deterministic.rst
 
 Args:

From a6bb8b52d4cce3ea356354e88ce4d05659bb21de Mon Sep 17 00:00:00 2001
From: davidriazati
Date: Fri, 4 Oct 2019 11:21:37 -0700
Subject: [PATCH 062/113] Reduce error context from 10 -> 3 (#26765)

Summary:
10 lines of error context (on both sides) is overkill, especially now that we have line numbers. With a compilation stack of a couple of functions, it becomes a pain to scroll to the top of the stack to see the real error every time.

This also fixes class names in the compilation stack to use the format `ClassName.method_name` instead of the fully qualified name.

Old output
```
clip_boxes_to_image(Tensor boxes, (int, int) size) -> (Tensor):
Expected a value of type 'Tuple[int, int]' for argument 'size' but instead found type 'Tuple[int, int, int]'.
: at /home/davidriazati/dev/vision/torchvision/models/detection/rpn.py:365:20 top_n_idx = self._get_top_n_idx(objectness, num_anchors_per_level) batch_idx = torch.arange(num_images, device=device)[:, None] objectness = objectness[batch_idx, top_n_idx] levels = levels[batch_idx, top_n_idx] proposals = proposals[batch_idx, top_n_idx] final_boxes = [] final_scores = [] for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes): boxes = box_ops.clip_boxes_to_image(boxes, img_shape) ~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE keep = box_ops.remove_small_boxes(boxes, self.min_size) boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep] # non-maximum suppression, independently done per level keep = box_ops.batched_nms(boxes, scores, lvl, self.nms_thresh) # keep only topk scoring predictions keep = keep[:self.post_nms_top_n] boxes, scores = boxes[keep], scores[keep] final_boxes.append(boxes) final_scores.append(scores) 'RegionProposalNetwork.filter_proposals' is being compiled since it was called from 'RegionProposalNetwork.forward' at /home/davidriazati/dev/vision/torchvision/models/detection/rpn.py:446:8 num_images = len(anchors) num_anchors_per_level = [o[0].numel() for o in objectness] objectness, pred_bbox_deltas = \ concat_box_prediction_layers(objectness, pred_bbox_deltas) # apply pred_bbox_deltas to anchors to obtain the decoded proposals # note that we detach the deltas because Faster R-CNN do not backprop through # the proposals proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors) proposals = proposals.view(num_images, -1, 4) boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE losses = {} if self.training: assert targets is not None labels, matched_gt_boxes = self.assign_targets_to_anchors(anchors, targets) regression_targets = self.box_coder.encode(matched_gt_boxes, anchors) loss_objectness, loss_rpn_box_reg = self.compute_loss( objectness, pred_bbox_deltas, labels, regression_targets) losses = { 'RegionProposalNetwork.forward' is being compiled since it was called from 'MaskRCNN.forward' at /home/davidriazati/dev/vision/torchvision/models/detection/generalized_rcnn.py:53:8 """ if self.training and targets is None: raise ValueError("In training mode, targets should be passed") original_image_sizes = [(img.shape[-2], img.shape[-3]) for img in images] images, targets = self.transform(images, targets) features = self.backbone(images.tensors) if isinstance(features, torch.Tensor): features = OrderedDict([(0, features)]) proposals, proposal_losses = self.rpn(images, features, targets) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets) detections = self.transform.postprocess(detections, images.image_sizes, original_image_sizes) losses = {} losses.update(detector_losses) losses.update(proposal_losses) # TODO: multiple return types?? # if self.training: ``` New output ``` RuntimeError: clip_boxes_to_image(Tensor boxes, (int, int) size) -> (Tensor): Expected a value of type 'Tuple[int, int]' for argument 'size' but instead found type 'Tuple[int, int, int]'. 
: at /home/davidriazati/dev/vision/torchvision/models/detection/rpn.py:365:20
    final_scores = []
    for boxes, scores, lvl, img_shape in zip(proposals, objectness, levels, image_shapes):
        boxes = box_ops.clip_boxes_to_image(boxes, img_shape)
                ~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        keep = box_ops.remove_small_boxes(boxes, self.min_size)
        boxes, scores, lvl = boxes[keep], scores[keep], lvl[keep]
'RegionProposalNetwork.filter_proposals' is being compiled since it was called from 'RegionProposalNetwork.forward'
at /home/davidriazati/dev/vision/torchvision/models/detection/rpn.py:446:8
        proposals = self.box_coder.decode(pred_bbox_deltas.detach(), anchors)
        proposals = proposals.view(num_images, -1, 4)
        boxes, scores = self.filter_proposals(proposals, objectness, images.image_sizes, num_anchors_per_level)
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        losses = {}
'RegionProposalNetwork.forward' is being compiled since it was called from 'MaskRCNN.forward'
at /home/davidriazati/dev/vision/torchvision/models/detection/generalized_rcnn.py:53:8
        if isinstance(features, torch.Tensor):
            features = OrderedDict([(0, features)])
        proposals, proposal_losses = self.rpn(images, features, targets)
        ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
        detections, detector_losses = self.roi_heads(features, proposals, images.image_sizes, targets)
        detections = self.transform.postprocess
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/26765

Pulled By: driazati

Differential Revision: D17560963

fbshipit-source-id: e463548744b505ca17f0158079b80e08fda47d49
---
 docs/source/jit.rst                |  3 +++
 torch/csrc/jit/script/compiler.cpp | 10 ++++++++--
 torch/csrc/jit/source_range.h      |  2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/docs/source/jit.rst b/docs/source/jit.rst
index beabafd55b324..f962b83c0e8ac 100644
--- a/docs/source/jit.rst
+++ b/docs/source/jit.rst
@@ -449,9 +449,12 @@ Example (a type mismatch)
             if x:
             ~~~~~... <--- HERE
                 r = torch.rand(1)
+            else:
+
     and was used here:
         else:
             r = 4
         return r
+               ~ <--- HERE
     ...

diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp
index c95b689a0297a..4f8031a3e9615 100644
--- a/torch/csrc/jit/script/compiler.cpp
+++ b/torch/csrc/jit/script/compiler.cpp
@@ -3289,8 +3289,14 @@ std::unique_ptr<Function> CompilationUnit::define(
   auto creator = [def, _resolver, self](Function& method) {
     // Store the function name so that it can be referenced if there is an error
     // while compiling this function
-    ErrorReport::CallStack call(
-        self ? method.qualname().qualifiedName() : method.qualname().name());
+    std::string call_name = method.qualname().name();
+    if (self) {
+      auto atoms = method.qualname().atoms();
+      // There should be at least a ClassName.method_name
+      TORCH_INTERNAL_ASSERT(atoms.size() >= 2);
+      call_name = atoms.at(atoms.size() - 2) + "." + atoms.at(atoms.size() - 1);
+    }
+    ErrorReport::CallStack call(call_name);
     to_ir(def, _resolver, self, method);
   };
   auto name = prefix ?
QualifiedName(*prefix, def.name().name())
diff --git a/torch/csrc/jit/source_range.h b/torch/csrc/jit/source_range.h
index 5a17bd61a4dab..28ccae3c9a8b5 100644
--- a/torch/csrc/jit/source_range.h
+++ b/torch/csrc/jit/source_range.h
@@ -112,7 +112,7 @@ struct CAFFE2_API SourceRange {
   size_t size() const {
     return end() - start();
   }
-  static const size_t CONTEXT = 10;
+  static const size_t CONTEXT = 3;
   void highlight(std::ostream& out) const;
   const std::shared_ptr<Source>& source() const {
     return source_;

From a33dbccf60a3560d6711d3eda6e06a0a7ef46e91 Mon Sep 17 00:00:00 2001
From: davidriazati
Date: Fri, 4 Oct 2019 12:28:11 -0700
Subject: [PATCH 063/113] Fix some return std::move warnings (#27384)

Summary:
clang-tidy was complaining about these
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27384

Pulled By: driazati

Differential Revision: D17767412

fbshipit-source-id: 03e2630790edf3f6bbf9064e754156613032b464
---
 torch/csrc/jit/import.cpp               | 2 +-
 torch/csrc/jit/passes/bailout_graph.cpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/import.cpp b/torch/csrc/jit/import.cpp
index 330dc58258f74..5e77c39b8a179 100644
--- a/torch/csrc/jit/import.cpp
+++ b/torch/csrc/jit/import.cpp
@@ -131,7 +131,7 @@ IValue ScriptModuleDeserializer::readArchive(const std::string& archive_name) {
     (*type.type_->getMethod("__setstate__"))({obj, input});
     setGraphExecutorOptimize(true);
     postSetStateValidate(obj);
-    return std::move(obj);
+    return obj;
   } else {
     auto dict = std::move(input).toGenericDict();
     auto obj = c10::ivalue::Object::create(type, n);
diff --git a/torch/csrc/jit/passes/bailout_graph.cpp b/torch/csrc/jit/passes/bailout_graph.cpp
index 28c3543f0fbba..95eb6d5e66fa6 100644
--- a/torch/csrc/jit/passes/bailout_graph.cpp
+++ b/torch/csrc/jit/passes/bailout_graph.cpp
@@ -23,7 +23,7 @@ static std::unordered_set collectLoopCounts(Node *n) {
     it = outerNode->owningBlock();
   }
-  return std::move(loopCounts);
+  return loopCounts;
 }

 struct BailOutGraphBuilderForNode {

From 0be6641fbf2e92565416021e853c2fc43c7554bd Mon Sep 17 00:00:00 2001
From: Rohan Varma
Date: Fri, 4 Oct 2019 12:48:12 -0700
Subject: [PATCH 064/113] add function to get nccl version for error messages
 (#27068)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27068

Adds a function that uses ncclGetVersion from the NCCL API to retrieve the NCCL version, converts it into a readable string, and includes that string in NCCL-related error messages. Hopefully this will help with debugging NCCL errors.
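A minimal sketch of the decoding this helper performs (illustrative only; `formatNcclVersion` is a hypothetical standalone name, and it assumes NCCL's integer version encoding of major * 1000 + minor * 100 + patch, which the new getNcclVersion in NCCLUtils.cpp relies on):

```cpp
#include <string>

// Illustrative sketch: decode NCCL's integer version code into a readable
// string, e.g. 2408 -> "2.4.8". Mirrors the arithmetic in getNcclVersion().
std::string formatNcclVersion(int version) {
  int major = version / 1000;
  int minor = (version % 1000) / 100;
  int patch = version % (major * 1000 + minor * 100);
  return std::to_string(major) + "." + std::to_string(minor) + "." +
      std::to_string(patch);
}
```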
Test Plan:
Modify C10D_NCCL_CHECK in NCCLUtils.hpp to always error by setting ncclResult_t error = ncclSystemError, then force an NCCL error with the script test/simulate_nccl_errors.py:

Start master node: python test/simulate_nccl_errors.py localhost 9124 0 2
Start other node: python test/simulate_nccl_errors.py localhost 9124 1 2

On the master node, you should see the following error message w/NCCL version:
```
Traceback (most recent call last):
  File "simulate_nccl_errors.py", line 29, in <module>
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()
RuntimeError: NCCL error in: ../torch/lib/c10d/ProcessGroupNCCL.cpp:375, unhandled system error, NCCL version 2.4.8
```

Differential Revision: D17639476

fbshipit-source-id: a2f558ad9e883b6be173cfe758ec56cf140bc1ee
---
 torch/lib/c10d/CMakeLists.txt       |  2 +-
 torch/lib/c10d/NCCLUtils.cpp        | 32 ++++++++++++++++++++++++++++++
 torch/lib/c10d/NCCLUtils.hpp        | 20 ++++++++++---------
 torch/lib/c10d/ProcessGroupNCCL.cpp |  2 +-
 4 files changed, 45 insertions(+), 11 deletions(-)
 create mode 100644 torch/lib/c10d/NCCLUtils.cpp

diff --git a/torch/lib/c10d/CMakeLists.txt b/torch/lib/c10d/CMakeLists.txt
index 49d55105573d3..497bdcb5d29a3 100644
--- a/torch/lib/c10d/CMakeLists.txt
+++ b/torch/lib/c10d/CMakeLists.txt
@@ -56,7 +56,7 @@ set(C10D_SRCS
 set(C10D_LIBS torch)

 if(USE_C10D_NCCL)
-  list(APPEND C10D_SRCS ProcessGroupNCCL.cpp)
+  list(APPEND C10D_SRCS ProcessGroupNCCL.cpp NCCLUtils.cpp)
   list(APPEND C10D_LIBS __caffe2_nccl)
 endif()

diff --git a/torch/lib/c10d/NCCLUtils.cpp b/torch/lib/c10d/NCCLUtils.cpp
new file mode 100644
index 0000000000000..c00d383ae69d0
--- /dev/null
+++ b/torch/lib/c10d/NCCLUtils.cpp
@@ -0,0 +1,32 @@
+#include <c10d/NCCLUtils.hpp>
+#include <mutex>
+
+namespace c10d {
+
+std::string getNcclVersion() {
+  static std::once_flag ncclGetVersionFlag;
+  static std::string versionString;
+
+  std::call_once(ncclGetVersionFlag, []() {
+    int version;
+    ncclResult_t status = ncclGetVersion(&version);
+    // Can't decode the version if the call failed.
+    if (status != ncclSuccess) {
+      versionString = "Unknown NCCL version";
+      return;
+    }
+    auto ncclMajor = version / 1000;
+    auto ncclMinor = (version % 1000) / 100;
+    auto ncclPatch = version % (ncclMajor * 1000 + ncclMinor * 100);
+    versionString = std::to_string(ncclMajor) + "." +
+        std::to_string(ncclMinor) + "." +
+ std::to_string(ncclPatch); + }); + + return versionString; +} + +std::string ncclGetErrorWithVersion(ncclResult_t error) { + return std::string(ncclGetErrorString(error)) + ", NCCL version " + + getNcclVersion(); +} + +} // namespace c10d diff --git a/torch/lib/c10d/NCCLUtils.hpp b/torch/lib/c10d/NCCLUtils.hpp index 4ab1f5757691f..96e44b2f45624 100644 --- a/torch/lib/c10d/NCCLUtils.hpp +++ b/torch/lib/c10d/NCCLUtils.hpp @@ -10,19 +10,21 @@ #include #include -#define C10D_NCCL_CHECK(cmd) \ - do { \ - ncclResult_t error = cmd; \ - if (error != ncclSuccess) { \ - std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + \ - std::to_string(__LINE__) + ", " + \ - std::string(ncclGetErrorString(error)); \ - throw std::runtime_error(err); \ - } \ +#define C10D_NCCL_CHECK(cmd) \ + do { \ + ncclResult_t error = cmd; \ + if (error != ncclSuccess) { \ + std::string err = "NCCL error in: " + std::string(__FILE__) + ":" + \ + std::to_string(__LINE__) + ", " + ncclGetErrorWithVersion(error); \ + throw std::runtime_error(err); \ + } \ } while (0) namespace c10d { +std::string getNcclVersion(); +std::string ncclGetErrorWithVersion(ncclResult_t error); + // RAII wrapper for NCCL communicator class NCCLComm { public: diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index 85ea14c14e5c2..469c39245e70b 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -314,7 +314,7 @@ std::exception_ptr ProcessGroupNCCL::checkForNCCLErrorsInternal( ncclResult_t ncclAsyncErr = ncclComm->checkForNcclError(); if (ncclAsyncErr != ncclSuccess) { return std::make_exception_ptr(std::runtime_error( - "NCCL error: " + std::string(ncclGetErrorString(ncclAsyncErr)))); + "NCCL error: " + ncclGetErrorWithVersion(ncclAsyncErr))); } } From 192ca9730ff8dce30acff8a5e9942583e0b15021 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Fri, 4 Oct 2019 12:51:41 -0700 Subject: [PATCH 065/113] C++ API parity: Hardtanh Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27038 Test Plan: Imported from OSS Differential Revision: D17682405 Pulled By: pbelevich fbshipit-source-id: f65e76696e0041c3518f56da94f2e3b800305234 --- test/cpp/api/functional.cpp | 24 ++++++++++ test/cpp/api/modules.cpp | 44 +++++++++++++++++++ .../include/torch/nn/functional/activation.h | 8 ++++ .../api/include/torch/nn/modules/activation.h | 23 ++++++++++ .../api/include/torch/nn/options/activation.h | 16 +++++++ torch/csrc/api/src/nn/modules/activation.cpp | 26 +++++++++++ 6 files changed, 141 insertions(+) diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 288e7d7ab4c81..37b1cab932457 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -303,3 +303,27 @@ TEST_F(FunctionalTest, OneHot) { ASSERT_EQ(y.sizes(), torch::IntArrayRef({3, 2, 3})); } } + +TEST_F(FunctionalTest, Hardtanh) { + const auto size = 3; + for (const auto min_val : {-4.2, -1.0, -0.42, 0.0}) { + for (const auto max_val : {0.0, 0.42, 1.0, 4.2}) { + for (const auto inplace : {false, true}) { + auto x = torch::linspace(-10.0, 10.0, size * size * size); + x.resize_({size, size, size}); + auto y_exp = (x < min_val) * min_val + + ((x >= min_val) * (x <= max_val)) * x + + (x > max_val) * max_val; + auto y = F::hardtanh(x,HardtanhOptions().min_val(min_val) + .max_val(max_val).inplace(inplace)); + + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), torch::IntArrayRef({size, size, size})); + ASSERT_TRUE(torch::allclose(y, y_exp)); + if (inplace) { + 
ASSERT_TRUE(torch::allclose(x, y_exp)); + } + } + } + } +} diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 7b8bfe6e22eb6..53e59e29ff0f3 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -1044,6 +1044,42 @@ TEST_F(ModulesTest, Hardshrink) { } } +TEST_F(ModulesTest, Hardtanh) { + const auto size = 3; + for (const auto min_val : {-4.2, -1.0, -0.42, 0.0}) { + for (const auto max_val : {0.42, 1.0, 4.2}) { + Hardtanh model {HardtanhOptions().min_val(min_val).max_val(max_val)}; + auto x = torch::linspace(-10.0, 10.0, size * size * size); + x.resize_({size, size, size}).set_requires_grad(true); + auto y = model(x); + torch::Tensor s = y.sum(); + + s.backward(); + ASSERT_EQ(s.ndimension(), 0); + + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), torch::IntArrayRef({size, size, size})); + auto y_exp = (x < min_val) * min_val + + ((x >= min_val) * (x <= max_val)) * x + + (x > max_val) * max_val; + ASSERT_TRUE(torch::allclose(y, y_exp)); + } + } +} + +TEST_F(ModulesTest, HardtanhMinValGEMaxVal) { + ASSERT_THROWS_WITH(Hardtanh{HardtanhOptions().min_val(0.42).max_val(0.42)}, + "max_val must be greater than min_val"); + ASSERT_THROWS_WITH(Hardtanh{HardtanhOptions().min_val(0.42).max_val(-0.42)}, + "max_val must be greater than min_val"); + + Hardtanh ht {HardtanhOptions().min_val(-0.42).max_val(0.42)}; + ht->options.min_val(0.42); + ASSERT_THROWS_WITH(ht->reset(), "max_val must be greater than min_val"); + ht->options.max_val(-0.42); + ASSERT_THROWS_WITH(ht->reset(), "max_val must be greater than min_val"); +} + TEST_F(ModulesTest, PrettyPrintIdentity) { ASSERT_EQ(c10::str(Identity()), "torch::nn::Identity()"); } @@ -1278,3 +1314,11 @@ TEST_F(ModulesTest, PrettyPrintHardshrink) { ASSERT_EQ(c10::str(Hardshrink(HardshrinkOptions().lambda(42.42))), "torch::nn::Hardshrink(42.42)"); } + +TEST_F(ModulesTest, PrettyPrintHardtanh) { + ASSERT_EQ(c10::str(Hardtanh()), + "torch::nn::Hardtanh(min_val=-1, max_val=1)"); + ASSERT_EQ(c10::str(Hardtanh( + HardtanhOptions().min_val(-42.42).max_val(0.42).inplace(true))), + "torch::nn::Hardtanh(min_val=-42.42, max_val=0.42, inplace=true)"); +} diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index d9688b5b3a88d..cf3648e558191 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -20,6 +20,14 @@ inline Tensor hardshrink(const Tensor& input, return torch::hardshrink(input, options.lambda()); } +inline Tensor hardtanh(Tensor& input, const HardtanhOptions& options) { + if (options.inplace()) { + return torch::hardtanh_(input, options.min_val(), options.max_val()); + } else { + return torch::hardtanh(input, options.min_val(), options.max_val()); + } +} + } // namespace functional } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h index bc7c1c3b28a5b..70f568136d079 100644 --- a/torch/csrc/api/include/torch/nn/modules/activation.h +++ b/torch/csrc/api/include/torch/nn/modules/activation.h @@ -55,5 +55,28 @@ class TORCH_API HardshrinkImpl : public torch::nn::Cloneable { TORCH_MODULE(Hardshrink); +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Hardtanh ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +/// Applies the HardTanh function element-wise. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.Hardtanh to learn +/// about the exact behavior of this module. 
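+///
+/// Example (illustrative sketch; assumes the `Hardtanh` module wrapper
+/// generated by TORCH_MODULE below):
+///   Hardtanh model(HardtanhOptions().min_val(-2.0).max_val(2.0));
+///   auto x = torch::linspace(-3.0, 3.0, 7);
+///   auto y = model(x); // values outside [-2, 2] are clamped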
+class TORCH_API HardtanhImpl : public torch::nn::Cloneable { + public: + HardtanhImpl() : HardtanhImpl(HardtanhOptions()) {} + explicit HardtanhImpl(const HardtanhOptions& options_); + + Tensor forward(Tensor& input); + + void reset() override; + + /// Pretty prints the `Hardtanh` module into the given `stream`. + void pretty_print(std::ostream& stream) const override; + + /// The options with which this `Module` was constructed. + HardtanhOptions options; +}; + +TORCH_MODULE(Hardtanh); + } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index 8ec2eafe9cf93..43067e611980a 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -27,5 +27,21 @@ struct TORCH_API HardshrinkOptions { TORCH_ARG(double, lambda); }; +// ============================================================================ + +/// Options for Hardtanh functional and module. +struct HardtanhOptions { + HardtanhOptions() {} + + /// minimum value of the linear region range. Default: -1 + TORCH_ARG(double, min_val) = -1.0; + + /// maximum value of the linear region range. Default: 1 + TORCH_ARG(double, max_val) = 1.0; + + /// can optionally do the operation in-place. Default: False + TORCH_ARG(bool, inplace) = false; +}; + } // namespace nn } // namespace torch diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index e06bfad248ede..40c7354caaee6 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -38,5 +38,31 @@ void HardshrinkImpl::pretty_print(std::ostream& stream) const { << "torch::nn::Hardshrink(" << options.lambda() << ")"; } +// ============================================================================ + +HardtanhImpl::HardtanhImpl(const HardtanhOptions& options_) + : options(options_) { + reset(); +} + +Tensor HardtanhImpl::forward(Tensor& input) { + return F::hardtanh(input, options); +} + +void HardtanhImpl::reset() { + TORCH_CHECK(options.max_val() > options.min_val(), + "max_val must be greater than min_val"); +} + +void HardtanhImpl::pretty_print(std::ostream& stream) const { + stream << std::boolalpha + << "torch::nn::Hardtanh(min_val=" << options.min_val() + << ", max_val=" << options.max_val(); + if (options.inplace()) { + stream << std::boolalpha << ", inplace=" << options.inplace(); + } + stream << ")"; +} + } // namespace nn } // namespace torch From f4c37e6b32407b2b84d117c89877218a66629d28 Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Fri, 4 Oct 2019 13:05:18 -0700 Subject: [PATCH 066/113] fix OSX CI build (#27373) Summary: fix OSX caffe2 CI build, attempt 1 Pull Request resolved: https://github.com/pytorch/pytorch/pull/27373 Differential Revision: D17768461 Pulled By: soumith fbshipit-source-id: b0a076c07382327730b5d86b8a00f5388c368b5e --- .circleci/config.yml | 10 +++------- .circleci/verbatim-sources/caffe2-job-specs.yml | 10 +++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4304000f9b515..fb1808599691e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -551,12 +551,7 @@ jobs: # Reinitialize path (see man page for path_helper(8)) eval `/usr/libexec/path_helper -s` - # Use Homebrew Python if configured to do so - if [ "${PYTHON_INSTALLATION}" == "homebrew" ]; then - export 
PATH=/usr/local/opt/python/libexec/bin:/usr/local/bin:$PATH
-      fi
-
-      pip -q install numpy
+      export PATH=/usr/local/opt/python/libexec/bin:/usr/local/bin:$PATH

       # Install Anaconda if we need to
       if [ -n "${CAFFE2_USE_ANACONDA}" ]; then
@@ -569,6 +564,8 @@
           source ${TMPDIR}/anaconda/bin/activate
         fi

+        pip -q install numpy
+
         # Install sccache
         sudo curl https://s3.amazonaws.com/ossci-macos/sccache --output /usr/local/bin/sccache
         sudo chmod +x /usr/local/bin/sccache
@@ -606,7 +603,6 @@
         if which sccache > /dev/null; then
           sccache --show-stats
         fi
-
   binary_linux_build:
     <<: *binary_linux_build_params
     steps:
diff --git a/.circleci/verbatim-sources/caffe2-job-specs.yml b/.circleci/verbatim-sources/caffe2-job-specs.yml
index 4e46a269e523c..09536c51394ac 100644
--- a/.circleci/verbatim-sources/caffe2-job-specs.yml
+++ b/.circleci/verbatim-sources/caffe2-job-specs.yml
@@ -146,12 +146,7 @@
       # Reinitialize path (see man page for path_helper(8))
       eval `/usr/libexec/path_helper -s`

-      # Use Homebrew Python if configured to do so
-      if [ "${PYTHON_INSTALLATION}" == "homebrew" ]; then
-        export PATH=/usr/local/opt/python/libexec/bin:/usr/local/bin:$PATH
-      fi
-
-      pip -q install numpy
+      export PATH=/usr/local/opt/python/libexec/bin:/usr/local/bin:$PATH

       # Install Anaconda if we need to
       if [ -n "${CAFFE2_USE_ANACONDA}" ]; then
@@ -164,6 +159,8 @@
           source ${TMPDIR}/anaconda/bin/activate
         fi

+        pip -q install numpy
+
         # Install sccache
         sudo curl https://s3.amazonaws.com/ossci-macos/sccache --output /usr/local/bin/sccache
         sudo chmod +x /usr/local/bin/sccache
@@ -201,4 +198,3 @@
         if which sccache > /dev/null; then
           sccache --show-stats
         fi
-

From 646e214706fd940ae36c51cdc8d66789962203ff Mon Sep 17 00:00:00 2001
From: Pritam Damania
Date: Fri, 4 Oct 2019 13:27:19 -0700
Subject: [PATCH 067/113] ProcessGroupNCCL should respect timeout passed in to
 init_process_group. (#27224)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27224

As part of adding error handling to NCCL, we are now able to specify a timeout for operations using ProcessGroupNCCL. However, this timeout had a default of 10 seconds and didn't respect the timeout specified in init_process_group. In this change, I've ensured we pass the appropriate timeout to ProcessGroupNCCL.
ghstack-source-id: 91283548

Test Plan: Added unit test to verify timeout passed in to init_process_group is respected.

Differential Revision: D17717992

fbshipit-source-id: c73320187f1f3b2693ba1e177d80646e282d01a2
---
 test/test_c10d.py                     | 124 ++++++++++++++++++--------
 torch/distributed/distributed_c10d.py |  10 ++-
 2 files changed, 92 insertions(+), 42 deletions(-)

diff --git a/test/test_c10d.py b/test/test_c10d.py
index 5e9584f94f5dc..d850aba95d56a 100644
--- a/test/test_c10d.py
+++ b/test/test_c10d.py
@@ -2952,10 +2952,9 @@ def test_multi_limit_multi_dtype(self):
         result = dist._compute_bucket_assignment_by_size(tensors, [200, 400])
         self.assertEqual([[0], [1], [2, 4], [3, 5]], result)

-
-class CommTest(MultiProcessTestCase):
+class NcclErrorHandlingTest(MultiProcessTestCase):
     def setUp(self):
-        super(CommTest, self).setUp()
+        super(NcclErrorHandlingTest, self).setUp()
         # Need to skip return code checking for these tests since the child
         # processes don't exit cleanly.
         self.skip_return_code_checks = [
@@ -2966,17 +2965,8 @@ def setUp(self):
         ]
         self._fork_processes()

-    def _get_wrapped_func(self, func):
-        # Get the original function which was wrapped in the decorator.
-        if hasattr(func, '__wrapped__'):
-            # py3 way.
-            return func.__wrapped__
-        else:
-            # py2 way.
- return func.func_closure[0].cell_contents - def tearDown(self): - super(CommTest, self).tearDown() + super(NcclErrorHandlingTest, self).tearDown() try: os.remove(self.file_name) except OSError: @@ -2990,33 +2980,14 @@ def op_timeout_sec(self): def world_size(self): return 2 - def _test_broadcast_coalesced(self, process_group, device): - half = torch.float16 - - # No support for float16 for CPU tensors - if device == torch.device('cpu'): - half = torch.float32 - - target = torch.arange(60, dtype=half, device=device).chunk(5) - target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) - target += torch.arange(60, dtype=half, device=device).chunk(5) - target += torch.arange(60, dtype=torch.float64, device=device).chunk(5) - target += torch.arange(60, dtype=half, device=device).chunk(5) - target += torch.arange(60, dtype=torch.float32, device=device).chunk(5) - - # The tensors to pass to broadcast are idential to the target - # only on the process that is the root of the broadcast. - if self.rank == 0: - tensors = list(tensor.clone() for tensor in target) + def _get_wrapped_func(self, func): + # Get the original function which was wrapped in the decorator. + if hasattr(func, '__wrapped__'): + # py3 way. + return func.__wrapped__ else: - tensors = list(torch.empty_like(tensor) for tensor in target) - - c10d._broadcast_coalesced( - process_group, - tensors, - buffer_size=256) - - self.assertEqual(tensors, target) + # py2 way. + return func.func_closure[0].cell_contents def _run_all_reduce(self, pg): pg.allreduce(torch.rand(10).cuda(self.rank)) @@ -3098,6 +3069,81 @@ def test_invalid_nccl_blocking_wait_env(self): self._run_invalid_nccl_blocking_wait_env('2147483647') self._run_invalid_nccl_blocking_wait_env('4294967295') + @requires_nccl() + @skip_if_not_multigpu + def test_nccl_timeout(self): + store = c10d.FileStore(self.file_name, self.world_size) + os.environ["NCCL_BLOCKING_WAIT"] = "1" + + # Initialize process_group. + timeout = 1 + c10d.distributed_c10d.init_process_group( + backend=dist.Backend.NCCL, store=store, world_size=2, rank=self.rank, + timeout=timedelta(seconds=timeout)) + c10d.distributed_c10d.all_reduce(torch.rand(10).cuda(self.rank)) + + if self.rank == 0: + # This should timeout in about 1 second. + start = time.time() + with self.assertRaises(RuntimeError): + c10d.distributed_c10d.all_reduce(torch.rand(10).cuda(self.rank)) + + total_time = time.time() - start + + self.assertLess(abs(total_time - timeout), 0.5) + else: + # Ensure the other rank sleeps to trigger timeout. 
+            time.sleep(2 * timeout)
+
+
+class CommTest(MultiProcessTestCase):
+    def setUp(self):
+        super(CommTest, self).setUp()
+        self._fork_processes()
+
+    def tearDown(self):
+        super(CommTest, self).tearDown()
+        try:
+            os.remove(self.file_name)
+        except OSError:
+            pass
+
+    @property
+    def op_timeout_sec(self):
+        return 1
+
+    @property
+    def world_size(self):
+        return 2
+
+    def _test_broadcast_coalesced(self, process_group, device):
+        half = torch.float16
+
+        # No support for float16 for CPU tensors
+        if device == torch.device('cpu'):
+            half = torch.float32
+
+        target = torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
+        target += torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float64, device=device).chunk(5)
+        target += torch.arange(60, dtype=half, device=device).chunk(5)
+        target += torch.arange(60, dtype=torch.float32, device=device).chunk(5)
+
+        # The tensors to pass to broadcast are identical to the target
+        # only on the process that is the root of the broadcast.
+        if self.rank == 0:
+            tensors = list(tensor.clone() for tensor in target)
+        else:
+            tensors = list(torch.empty_like(tensor) for tensor in target)
+
+        c10d._broadcast_coalesced(
+            process_group,
+            tensors,
+            buffer_size=256)
+
+        self.assertEqual(tensors, target)
+
     @requires_nccl()
     @skip_if_not_multigpu
     def test_broadcast_coalesced_nccl(self):
diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py
index f4b455b9daf7c..098d9d5b554a5 100644
--- a/torch/distributed/distributed_c10d.py
+++ b/torch/distributed/distributed_c10d.py
@@ -129,7 +129,8 @@ class GroupMember(object):
 _default_pg_init_method = None

 # Default process group wide timeout, if applicable.
-# This currently only applies to the gloo backend. To make an attempt at
+# This only applies to the gloo and nccl backends
+# (only if NCCL_BLOCKING_WAIT is set to 1). To make an attempt at
 # backwards compatibility with THD, we use an extraordinarily high default
 # timeout, given that THD did not have timeouts.
 _default_pg_timeout = timedelta(minutes=30)
@@ -346,7 +347,9 @@ def init_process_group(backend,
                                  Mutually exclusive with ``init_method``.
         timeout (timedelta, optional): Timeout for operations executed against
             the process group. Default value equals 30 minutes.
-            This is only applicable for the ``gloo`` backend.
+            This is applicable for the ``gloo`` backend. For ``nccl``, this is
+            applicable only if the environment variable ``NCCL_BLOCKING_WAIT``
+            is set to 1.
         group_name (str, optional, deprecated): Group name.

     To enable ``backend == Backend.MPI``, PyTorch needs to built from source
@@ -485,7 +488,8 @@ def _new_process_group_helper(world_size,
             pg = ProcessGroupNCCL(
                 prefix_store,
                 rank,
-                world_size)
+                world_size,
+                timeout)
             _pg_map[pg] = (Backend.NCCL, store)
             _pg_names[pg] = group_name
         else:

From badb08d577d74936af51f6e4c1cc0906b4a4b616 Mon Sep 17 00:00:00 2001
From: Rohan Varma
Date: Fri, 4 Oct 2019 13:47:49 -0700
Subject: [PATCH 068/113] Add clip_grad_norm_ to c++ api (#26140)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26140

Per https://github.com/pytorch/pytorch/issues/25883, we want to work towards C++/Python API parity. This diff adds clip_grad_norm_ to the C++ API to improve parity.
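A minimal usage sketch (illustrative only; it assumes the `torch::nn::utils::clip_grad_norm_` overloads introduced below and an otherwise ordinary C++ frontend training step):

```cpp
#include <torch/torch.h>

int main() {
  // Sketch: assumes the clip_grad_norm_ overloads from this patch.
  auto model = torch::nn::Linear(10, 1);
  torch::optim::SGD optimizer(
      model->parameters(), torch::optim::SGDOptions(0.1));

  auto x = torch::randn({4, 10});
  auto y = torch::randn({4, 1});

  optimizer.zero_grad();
  auto loss = torch::mse_loss(model(x), y);
  loss.backward();

  // Rescale gradients in place so that their total 2-norm is at most 1.0.
  auto params = model->parameters();
  torch::nn::utils::clip_grad_norm_(params, /*max_norm=*/1.0);

  optimizer.step();
  return 0;
}
```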
ghstack-source-id: 91334333 ghstack-source-id: 91334333 Test Plan: Added a unit test Differential Revision: D17312367 fbshipit-source-id: 753ba3a4d084d01f3cc8919da3108e67c809ad65 --- setup.py | 1 + test/cpp/api/CMakeLists.txt | 1 + test/cpp/api/nn_utils.cpp | 103 ++++++++++++++++++ torch/csrc/api/include/torch/nn.h | 1 + torch/csrc/api/include/torch/nn/utils.h | 3 + .../api/include/torch/nn/utils/clip_grad.h | 62 +++++++++++ 6 files changed, 171 insertions(+) create mode 100644 test/cpp/api/nn_utils.cpp create mode 100644 torch/csrc/api/include/torch/nn/utils.h create mode 100644 torch/csrc/api/include/torch/nn/utils/clip_grad.h diff --git a/setup.py b/setup.py index 1850db6600905..7ada2774a1aa9 100644 --- a/setup.py +++ b/setup.py @@ -831,6 +831,7 @@ def print_box(msg): 'include/torch/csrc/api/include/torch/nn/modules/*.h', 'include/torch/csrc/api/include/torch/nn/modules/container/*.h', 'include/torch/csrc/api/include/torch/nn/parallel/*.h', + 'include/torch/csrc/api/include/torch/nn/utils/*.h', 'include/torch/csrc/api/include/torch/optim/*.h', 'include/torch/csrc/api/include/torch/serialize/*.h', 'include/torch/csrc/autograd/*.h', diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index 2b933817f6cc7..8621a581e68fc 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -14,6 +14,7 @@ set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/module.cpp ${TORCH_API_TEST_DIR}/modulelist.cpp ${TORCH_API_TEST_DIR}/modules.cpp + ${TORCH_API_TEST_DIR}/nn_utils.cpp ${TORCH_API_TEST_DIR}/optim.cpp ${TORCH_API_TEST_DIR}/ordered_dict.cpp ${TORCH_API_TEST_DIR}/rnn.cpp diff --git a/test/cpp/api/nn_utils.cpp b/test/cpp/api/nn_utils.cpp new file mode 100644 index 0000000000000..fa0693ec3ec3f --- /dev/null +++ b/test/cpp/api/nn_utils.cpp @@ -0,0 +1,103 @@ +#include + +#include + +#include + +using namespace torch::nn; +using namespace torch::test; + +struct NNUtilsTest : torch::test::SeedingFixture {}; + +TEST_F(NNUtilsTest, ClipGradNorm) { + auto linear_layer = Linear(10, 10); + float max_norm = 2; + auto compute_norm = [linear_layer](float norm_type) -> float { + float total_norm = 0.0; + if (norm_type != std::numeric_limits::infinity()) { + for (const auto& p : linear_layer->parameters()) { + total_norm += + p.grad().data().abs().pow(norm_type).sum().item().toFloat(); + } + return std::pow(total_norm, 1.0 / norm_type); + } else { + for (const auto& p : linear_layer->parameters()) { + auto param_max = p.grad().data().abs().max().item().toFloat(); + if (param_max > total_norm) { + total_norm = param_max; + } + } + return total_norm; + } + }; + auto compare_scaling = + [linear_layer](const std::vector& grads) -> torch::Tensor { + std::vector p_scale; + for (int i = 0; i < grads.size(); i++) { + auto param = linear_layer->parameters()[i]; + auto grad = grads[i]; + p_scale.push_back(param.grad().data().div(grad).view(-1)); + } + auto scale = torch::cat(p_scale); + return scale; // need to assert std is 0. 
+ }; + + std::vector grads = { + torch::arange(1.0, 101).view({10, 10}), + torch::ones(10).div(1000), + }; + std::vector norm_types = { + 0.5, + 1.5, + 2.0, + 4.0, + std::numeric_limits::infinity(), + }; + for (auto norm_type : norm_types) { + for (int i = 0; i < grads.size(); i++) { + linear_layer->parameters()[i].grad() = + grads[i].clone().view_as(linear_layer->parameters()[i].data()); + } + auto norm_before = compute_norm(norm_type); + auto layer_params = linear_layer->parameters(); + auto norm = utils::clip_grad_norm_(layer_params, max_norm, norm_type); + auto norm_after = compute_norm(norm_type); + ASSERT_FLOAT_EQ(norm, norm_before); + ASSERT_FLOAT_EQ(norm_after, max_norm); + ASSERT_LE(norm_after, max_norm); + auto scaled = compare_scaling(grads); + ASSERT_NEAR(0, scaled.std().item().toFloat(), 1e-7); + } + // Small gradients should be lefted unchanged + grads = { + torch::rand({10, 10}).div(10000), + torch::ones(10).div(500), + }; + for (auto norm_type : norm_types) { + for (int i = 0; i < grads.size(); i++) { + linear_layer->parameters()[i].grad().data().copy_(grads[i]); + } + auto norm_before = compute_norm(norm_type); + auto layer_params = linear_layer->parameters(); + auto norm = utils::clip_grad_norm_(layer_params, max_norm, norm_type); + auto norm_after = compute_norm(norm_type); + ASSERT_FLOAT_EQ(norm, norm_before); + ASSERT_FLOAT_EQ(norm_before, norm_after); + ASSERT_LE(norm_after, max_norm); + auto scaled = compare_scaling(grads); + ASSERT_NEAR(0, scaled.std().item().toFloat(), 1e-7); + ASSERT_EQ(scaled[0].item().toFloat(), 1); + } + // should accept a single tensor as input + auto p1 = torch::randn({10, 10}); + auto p2 = torch::randn({10, 10}); + auto g = torch::arange(1., 101).view({10, 10}); + p1.grad() = g.clone(); + p2.grad() = g.clone(); + for (const auto norm_type : norm_types) { + utils::clip_grad_norm_(p1, max_norm, norm_type); + std::vector params = {p2}; + utils::clip_grad_norm_(params, max_norm, norm_type); + ASSERT_TRUE(torch::allclose(p1.grad(), p2.grad())); + } +} diff --git a/torch/csrc/api/include/torch/nn.h b/torch/csrc/api/include/torch/nn.h index 10474512a2580..b93220b5d62a0 100644 --- a/torch/csrc/api/include/torch/nn.h +++ b/torch/csrc/api/include/torch/nn.h @@ -7,3 +7,4 @@ #include #include #include +#include diff --git a/torch/csrc/api/include/torch/nn/utils.h b/torch/csrc/api/include/torch/nn/utils.h new file mode 100644 index 0000000000000..550949ef08666 --- /dev/null +++ b/torch/csrc/api/include/torch/nn/utils.h @@ -0,0 +1,3 @@ +#pragma once + +#include diff --git a/torch/csrc/api/include/torch/nn/utils/clip_grad.h b/torch/csrc/api/include/torch/nn/utils/clip_grad.h new file mode 100644 index 0000000000000..fea0fe0a9c9f4 --- /dev/null +++ b/torch/csrc/api/include/torch/nn/utils/clip_grad.h @@ -0,0 +1,62 @@ +#pragma once + +#include + +namespace torch { +namespace nn { +namespace utils { + +// Clips gradient norm of a vector of Tensors. +// See +// https://pytorch.org/docs/stable/nn.html?highlight=clip_grad_norm#torch.nn.utils.clip_grad_norm_ +// for more details about this module. 
+inline float clip_grad_norm_( + std::vector& parameters, + float max_norm, + float norm_type = 2.0) { + std::vector params_with_grad; + + for (const auto& param : parameters) { + auto& grad = param.grad(); + if (grad.defined()) { + params_with_grad.push_back(param); + } + } + float total_norm = 0.0; + if (norm_type == std::numeric_limits::infinity()) { + for (const auto& param : params_with_grad) { + auto param_max = param.grad().data().abs().max().item().toFloat(); + if (param_max > total_norm) { + total_norm = param_max; + } + } + } else { + for (const auto& param : params_with_grad) { + auto param_norm = param.grad().data().norm(norm_type); + total_norm += std::pow(param_norm.item().toFloat(), norm_type); + } + total_norm = std::pow(total_norm, 1.0 / norm_type); + } + + auto clip_coef = max_norm / (total_norm + 1e-6); + if (clip_coef < 1) { + for (auto& param : params_with_grad) { + param.grad().data().mul_(clip_coef); + } + } + return total_norm; +} + +// A wrapper around clip_grad_norm_ that allows us to call the function with a +// single Tensor. +inline float clip_grad_norm_( + Tensor& parameters, + float max_norm, + float norm_type = 2.0) { + std::vector params = {parameters}; + return clip_grad_norm_(params, max_norm, norm_type); +} + +} // namespace utils +} // namespace nn +} // namespace torch From 8b61a220c0c5a571fdc0110b5ec17ccf79c845f2 Mon Sep 17 00:00:00 2001 From: Pavel Belevich Date: Fri, 4 Oct 2019 14:14:52 -0700 Subject: [PATCH 069/113] C++ API parity: LeakyReLU Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27059 Test Plan: Imported from OSS Differential Revision: D17682407 Pulled By: pbelevich fbshipit-source-id: 2a4f42e9438799ba8de7282ac7a6fd3ff97ee048 --- test/cpp/api/functional.cpp | 20 ++++++++++++++ test/cpp/api/modules.cpp | 27 +++++++++++++++++++ .../include/torch/nn/functional/activation.h | 8 ++++++ .../api/include/torch/nn/modules/activation.h | 23 ++++++++++++++++ .../api/include/torch/nn/options/activation.h | 13 +++++++++ torch/csrc/api/src/nn/modules/activation.cpp | 20 ++++++++++++++ 6 files changed, 111 insertions(+) diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp index 37b1cab932457..cc694973445e9 100644 --- a/test/cpp/api/functional.cpp +++ b/test/cpp/api/functional.cpp @@ -327,3 +327,23 @@ TEST_F(FunctionalTest, Hardtanh) { } } } + +TEST_F(FunctionalTest, LeakyReLU) { + const auto size = 3; + for (const auto negative_slope : {0.0, 0.42, 1.0}) { + for (const auto inplace : {false, true}) { + auto x = torch::linspace(-10.0, 10.0, size * size * size); + x.resize_({size, size, size}); + auto y_exp = (x < 0) * x * negative_slope + (x >= 0) * x; + auto y = F::leaky_relu(x, LeakyReLUOptions() + .negative_slope(negative_slope).inplace(inplace)); + + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), torch::IntArrayRef({size, size, size})); + ASSERT_TRUE(torch::allclose(y, y_exp)); + if (inplace) { + ASSERT_TRUE(torch::allclose(x, y_exp)); + } + } + } +} diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp index 53e59e29ff0f3..064082fbd0360 100644 --- a/test/cpp/api/modules.cpp +++ b/test/cpp/api/modules.cpp @@ -1080,6 +1080,25 @@ TEST_F(ModulesTest, HardtanhMinValGEMaxVal) { ASSERT_THROWS_WITH(ht->reset(), "max_val must be greater than min_val"); } +TEST_F(ModulesTest, LeakyReLU) { + const auto size = 3; + for (const auto negative_slope : {0.0, 0.42, 1.0}) { + LeakyReLU model {LeakyReLUOptions().negative_slope(negative_slope)}; + auto x = torch::linspace(-10.0, 10.0, size * size * size); + 
x.resize_({size, size, size}).set_requires_grad(true); + auto y = model(x); + torch::Tensor s = y.sum(); + + s.backward(); + ASSERT_EQ(s.ndimension(), 0); + + ASSERT_EQ(y.ndimension(), 3); + ASSERT_EQ(y.sizes(), torch::IntArrayRef({size, size, size})); + auto y_exp = (x < 0) * x * negative_slope + (x >= 0) * x; + ASSERT_TRUE(torch::allclose(y, y_exp)); + } +} + TEST_F(ModulesTest, PrettyPrintIdentity) { ASSERT_EQ(c10::str(Identity()), "torch::nn::Identity()"); } @@ -1322,3 +1341,11 @@ TEST_F(ModulesTest, PrettyPrintHardtanh) { HardtanhOptions().min_val(-42.42).max_val(0.42).inplace(true))), "torch::nn::Hardtanh(min_val=-42.42, max_val=0.42, inplace=true)"); } + +TEST_F(ModulesTest, PrettyPrintLeakyReLU) { + ASSERT_EQ(c10::str(LeakyReLU()), + "torch::nn::LeakyReLU(negative_slope=0.01)"); + ASSERT_EQ(c10::str(LeakyReLU( + LeakyReLUOptions().negative_slope(0.42).inplace(true))), + "torch::nn::LeakyReLU(negative_slope=0.42, inplace=true)"); +} diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h index cf3648e558191..684d57b731b89 100644 --- a/torch/csrc/api/include/torch/nn/functional/activation.h +++ b/torch/csrc/api/include/torch/nn/functional/activation.h @@ -28,6 +28,14 @@ inline Tensor hardtanh(Tensor& input, const HardtanhOptions& options) { } } +inline Tensor leaky_relu(Tensor& input, const LeakyReLUOptions& options) { + if (options.inplace()) { + return torch::leaky_relu_(input, options.negative_slope()); + } else { + return torch::leaky_relu(input, options.negative_slope()); + } +} + } // namespace functional } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h index 70f568136d079..36d84641a4a51 100644 --- a/torch/csrc/api/include/torch/nn/modules/activation.h +++ b/torch/csrc/api/include/torch/nn/modules/activation.h @@ -78,5 +78,28 @@ class TORCH_API HardtanhImpl : public torch::nn::Cloneable { TORCH_MODULE(Hardtanh); +// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ LeakyReLU ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +/// Applies the LeakyReLU function element-wise. +/// See https://pytorch.org/docs/master/nn.html#torch.nn.LeakyReLU to learn +/// about the exact behavior of this module. +class TORCH_API LeakyReLUImpl : public torch::nn::Cloneable { + public: + LeakyReLUImpl() : LeakyReLUImpl(LeakyReLUOptions()) {} + explicit LeakyReLUImpl(const LeakyReLUOptions& options_); + + Tensor forward(Tensor& input); + + void reset() override; + + /// Pretty prints the `LeakyReLU` module into the given `stream`. + void pretty_print(std::ostream& stream) const override; + + /// The options with which this `Module` was constructed. + LeakyReLUOptions options; +}; + +TORCH_MODULE(LeakyReLU); + } // namespace nn } // namespace torch diff --git a/torch/csrc/api/include/torch/nn/options/activation.h b/torch/csrc/api/include/torch/nn/options/activation.h index 43067e611980a..592f3c61e1c43 100644 --- a/torch/csrc/api/include/torch/nn/options/activation.h +++ b/torch/csrc/api/include/torch/nn/options/activation.h @@ -43,5 +43,18 @@ struct HardtanhOptions { TORCH_ARG(bool, inplace) = false; }; +// ============================================================================ + +/// Options for LeakyReLU functional and module. +struct LeakyReLUOptions { + LeakyReLUOptions() {} + + /// Controls the angle of the negative slope. Default: 1e-2 + TORCH_ARG(double, negative_slope) = 1e-2; + + /// can optionally do the operation in-place. 
Default: False + TORCH_ARG(bool, inplace) = false; +}; + } // namespace nn } // namespace torch diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index 40c7354caaee6..bf3893297ee2d 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -64,5 +64,25 @@ void HardtanhImpl::pretty_print(std::ostream& stream) const { stream << ")"; } +// ============================================================================ + +LeakyReLUImpl::LeakyReLUImpl(const LeakyReLUOptions& options_) + : options(options_) {} + +Tensor LeakyReLUImpl::forward(Tensor& input) { + return F::leaky_relu(input, options); +} + +void LeakyReLUImpl::reset() {} + +void LeakyReLUImpl::pretty_print(std::ostream& stream) const { + stream << std::boolalpha + << "torch::nn::LeakyReLU(negative_slope=" << options.negative_slope(); + if (options.inplace()) { + stream << std::boolalpha << ", inplace=" << options.inplace(); + } + stream << ")"; +} + } // namespace nn } // namespace torch From 3c2cd8cc109c440c03a83c1a27a205dbab2bca91 Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Fri, 4 Oct 2019 14:41:34 -0700 Subject: [PATCH 070/113] Some hipify script cleanups (#27375) Summary: continue https://github.com/pytorch/pytorch/issues/26363 Pull Request resolved: https://github.com/pytorch/pytorch/pull/27375 Differential Revision: D17764992 Pulled By: bddppq fbshipit-source-id: ecc06521179677efcedb1d58ceda63df7d63627e --- tools/amd_build/build_amd.py | 30 ------------------- .../pyHIPIFY/cuda_to_hip_mappings.py | 3 +- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 3643ab931eef8..48e8ecceabdea 100644 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -4,8 +4,6 @@ import os import subprocess import argparse -from functools import reduce -from itertools import chain from pyHIPIFY import hipify_python @@ -97,7 +95,6 @@ '*/hip/*', # These files are compatible with both cuda and hip "aten/src/ATen/core/*", - "torch/csrc/autograd/engine.cpp", # generated files we shouldn't frob "torch/lib/tmp_install/*", "torch/include/*", @@ -109,33 +106,6 @@ for filename in os.listdir(os.path.join(amd_build_dir, "patches")): subprocess.Popen(["git", "apply", os.path.join(patch_folder, filename)], cwd=proj_dir) - # Make various replacements inside AMD_BUILD/torch directory - ignore_files = [ - # These files use nvrtc, hip doesn't have equivalent - "csrc/autograd/profiler.h", - "csrc/autograd/profiler.cpp", - # These files are compatible with both cuda and hip - "csrc/autograd/engine.cpp" - ] - paths = ("torch", "tools") - for root, _directories, files in chain.from_iterable(os.walk(path) for path in paths): - for filename in files: - if filename.endswith(".cpp") or filename.endswith(".h") or filename.endswith(".hpp"): - source = os.path.join(root, filename) - # Disabled files - if reduce(lambda result, exclude: source.endswith(exclude) or result, ignore_files, False): - continue - # Update contents. - with open(source, "r+") as f: - contents = f.read() - contents = contents.replace("USE_CUDA", "USE_ROCM") - contents = contents.replace("CUDA_VERSION", "0") - f.seek(0) - f.write(contents) - f.truncate() - f.flush() - os.fsync(f) - # Check if the compiler is hip-clang. 
def is_hip_clang(): try: diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py index f34d5a0f38d3a..e220e9e306138 100644 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py @@ -494,7 +494,6 @@ ("CUDA_ARRAY3D_LAYERED", ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), ("CUDA_ARRAY3D_SURFACE_LDST", ("HIP_ARRAY3D_SURFACE_LDST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), ("CUDA_ARRAY3D_TEXTURE_GATHER", ("HIP_ARRAY3D_TEXTURE_GATHER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - # ("CUDA_VERSION", ("HIP_VERSION", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), ("CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK", ("hipDeviceAttributeMaxThreadsPerBlock", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), ("CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X", ("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), ("CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y", ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), @@ -2251,6 +2250,8 @@ ]) PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict([ + ("USE_CUDA", ("USE_ROCM", API_PYTORCH)), + ("CUDA_VERSION", ("HIP_VERSION", API_PYTORCH)), ("cudaHostAllocator", ("hipHostAllocator", API_PYTORCH)), ("cudaDeviceAllocator", ("hipDeviceAllocator", API_PYTORCH)), ("define MAX_NUM_BLOCKS 200", ("define MAX_NUM_BLOCKS 64", API_PYTORCH)), From 6db0cc472cdd524223aba60fb357403cb0f2e13d Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Fri, 4 Oct 2019 14:44:20 -0700 Subject: [PATCH 071/113] add some support for the occupancy API on ROCm (#27390) Summary: Unfortunately, the HIP function takes uint32_t* instead of int*, so we still need to ifdef for the time being. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27390 Differential Revision: D17768832 Pulled By: bddppq fbshipit-source-id: c65176660cb0783a04f0a4a064f686818d759589 --- aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h | 2 +- aten/src/ATen/native/cuda/SoftMax.cu | 6 +++++- torch/csrc/jit/fuser/cuda/fused_kernel.cpp | 9 ++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h index 36b7bd99acffc..7f831e5cdf785 100644 --- a/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h +++ b/aten/src/ATen/cuda/nvrtc_stub/ATenNVRTC.h @@ -61,7 +61,6 @@ namespace at { namespace cuda { // list above. // // HIP doesn't have -// cuOccupancyMaxActiveBlocksPerMultiprocessor // cuGetErrorString (maps to non-functional hipGetErrorString___) #define AT_FORALL_NVRTC(_) \ @@ -72,6 +71,7 @@ namespace at { namespace cuda { _(nvrtcGetPTX) \ _(cuModuleLoadData) \ _(cuModuleGetFunction) \ + _(cuOccupancyMaxActiveBlocksPerMultiprocessor) \ _(nvrtcGetErrorString) \ _(nvrtcGetProgramLogSize) \ _(nvrtcGetProgramLog) \ diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index ad77d95a86b4d..46ffdcaa27b4b 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -123,7 +123,11 @@ void SpatialSoftMax_getLaunchSizes( smem_size = block.x == 1 ? 0 : block_threads * sizeof(accscalar_t); int max_active_blocks; #ifdef __HIP_PLATFORM_HCC__ - max_active_blocks = 16; + // XXX HIP function signature is not compatible yet. 
+ uint32_t max_blocks; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, + k, block_threads, smem_size); + max_active_blocks = max_blocks; #else cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, k, block_threads, smem_size); diff --git a/torch/csrc/jit/fuser/cuda/fused_kernel.cpp b/torch/csrc/jit/fuser/cuda/fused_kernel.cpp index 4f2f7ebb9b045..b46adaf34ab2c 100644 --- a/torch/csrc/jit/fuser/cuda/fused_kernel.cpp +++ b/torch/csrc/jit/fuser/cuda/fused_kernel.cpp @@ -141,13 +141,16 @@ FusedKernelCUDA::FusedKernelCUDA( // Computes max blocks #ifdef __HIP_PLATFORM_HCC__ - // XXX this is a temporary hack until the occupancy API is supported in ROCm - maxBlocks_ = 16 * prop_->multiProcessorCount; + // XXX HIP function signature is not compatible yet + uint32_t max_blocks; + AT_CUDA_DRIVER_CHECK(nvrtc().cuOccupancyMaxActiveBlocksPerMultiprocessor( + &max_blocks, function_, 128, 0)); + maxBlocks_ = max_blocks; #else AT_CUDA_DRIVER_CHECK(nvrtc().cuOccupancyMaxActiveBlocksPerMultiprocessor( &maxBlocks_, function_, 128, 0)); - maxBlocks_ *= prop_->multiProcessorCount; #endif +maxBlocks_ *= prop_->multiProcessorCount; // Resets device (end of hacked at::DeviceGuard) at::cuda::set_device(prior_device); From 32d009a37f024684dd55041cb3b14bb6fe7de01f Mon Sep 17 00:00:00 2001 From: J M Dieterich Date: Fri, 4 Oct 2019 14:47:07 -0700 Subject: [PATCH 072/113] Add gfx908 to the list of per-default compiled architectures. (#27388) Summary: ROCm 2.8 added preliminary support for gfx908. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27388 Differential Revision: D17767772 Pulled By: bddppq fbshipit-source-id: 172daf5bb66d3db86a13e287059af4b9b90a7f57 --- cmake/public/LoadHIP.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index 414b2be3afbae..e552ade492b68 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -102,7 +102,7 @@ ELSE() ENDIF() IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) - SET(PYTORCH_ROCM_ARCH gfx803;gfx900;gfx906) + SET(PYTORCH_ROCM_ARCH gfx803;gfx900;gfx906;gfx908) ELSE() SET(PYTORCH_ROCM_ARCH $ENV{PYTORCH_ROCM_ARCH}) ENDIF() From 18215337f44e39bc8d237186684e9058595f764e Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Fri, 4 Oct 2019 16:12:24 -0700 Subject: [PATCH 073/113] Change nightly builds version to 1.4.0-SNAPSHOT (#27381) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27381 Changing android nightly builds from master to version 1.4.0-SNAPSHOT, as we also have 1.3.0-SNAPSHOT from the branch v1.3.0 Test Plan: Imported from OSS Differential Revision: D17773620 Pulled By: IvanKobzarev fbshipit-source-id: c39a1dbf5e06f79c25367c3bc602cc8ce42cd939 --- android/gradle.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/android/gradle.properties b/android/gradle.properties index ec9e4008fa562..ff63986f2d6de 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.0.7-SNAPSHOT +VERSION_NAME=1.4.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch POM_URL=https://github.com/pytorch/pytorch/tree/master/android From 92a2caa028d90a67e70e883bb411073640bab8e0 Mon Sep 17 00:00:00 2001 From: Ivan Kobzarev Date: Fri, 4 Oct 2019 16:19:41 -0700 Subject: [PATCH 074/113] Pickup proxy parameters for publishing (#27389) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27389 Pickup gradle proxy parameters (handy for 
publishing from devserver) in maven publishing gradle plugin Test Plan: Imported from OSS Differential Revision: D17773548 Pulled By: IvanKobzarev fbshipit-source-id: 662c0b2835e6cf1e4009da79e27268d4a19c2ceb --- android/gradle/gradle_maven_push.gradle | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/android/gradle/gradle_maven_push.gradle b/android/gradle/gradle_maven_push.gradle index c1660c7fddf90..5fdd8fbc6a037 100644 --- a/android/gradle/gradle_maven_push.gradle +++ b/android/gradle/gradle_maven_push.gradle @@ -25,6 +25,18 @@ def getRepositoryPassword() { return hasProperty('SONATYPE_NEXUS_PASSWORD') ? SONATYPE_NEXUS_PASSWORD : "" } +def getHttpProxyHost() { + return project.properties['systemProp.http.proxyHost'] +} + +def getHttpProxyPort() { + return project.properties['systemProp.http.proxyPort'] +} + +def needProxy() { + return (getHttpProxyHost() != null) && (getHttpProxyPort() != null) +} + afterEvaluate { project -> uploadArchives { repositories { @@ -37,9 +49,15 @@ afterEvaluate { project -> repository(url: getReleaseRepositoryUrl()) { authentication(userName: getRepositoryUsername(), password: getRepositoryPassword()) + if (needProxy()) { + proxy(host: getHttpProxyHost(), port: getHttpProxyPort() as Integer, type: 'http') + } } snapshotRepository(url: getSnapshotRepositoryUrl()) { authentication(userName: getRepositoryUsername(), password: getRepositoryPassword()) + if (needProxy()) { + proxy(host: getHttpProxyHost(), port: getHttpProxyPort() as Integer, type: 'http') + } } pom.project { From ac0f18437f37ed0abd29524c35d00ee8a9c124ba Mon Sep 17 00:00:00 2001 From: Raghuraman Krishnamoorthi Date: Fri, 4 Oct 2019 16:27:20 -0700 Subject: [PATCH 075/113] MovingAverage Observer (#27396) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27396 Observer that estimates moving averages of min and max values per batch, more suited for quantization aware training instead of minmax observers that track extremal values across batches ghstack-source-id: 91369018 Test Plan: buck test caffe2/test:quantization -- 'test_per_tensor_observers \(test_quantization\.ObserverTest\)' --print-passing-details buck test caffe2/test:quantization -- 'test_per_channel_observers \(test_quantization\.ObserverTest\)' --print-passing-details Differential Revision: D17727213 fbshipit-source-id: 024a890bf3dd0bf269d8bfe61f19871d027326f0 --- test/test_quantization.py | 271 ++++++++++++++++------------ torch/quantization/QConfig.py | 4 +- torch/quantization/fake_quantize.py | 10 +- torch/quantization/observer.py | 93 +++++++--- 4 files changed, 230 insertions(+), 148 deletions(-) diff --git a/test/test_quantization.py b/test/test_quantization.py index 6b737bcd9d7fc..d996ebd500283 100644 --- a/test/test_quantization.py +++ b/test/test_quantization.py @@ -10,7 +10,9 @@ QConfig, QConfigDynamic, default_observer, default_weight_observer, get_observer_dict,\ quantize, prepare, convert, prepare_qat, quantize_qat, fuse_modules, \ quantize_dynamic, default_qconfig, default_debug_qconfig, default_qat_qconfig, \ - default_dynamic_qconfig, HistogramObserver, MinMaxObserver, PerChannelMinMaxObserver, RecordingObserver, QuantWrapper + default_dynamic_qconfig, HistogramObserver, MinMaxObserver, PerChannelMinMaxObserver,\ + RecordingObserver, MovingAverageMinMaxObserver, MovingAveragePerChannelMinMaxObserver, \ + QuantWrapper from torch.quantization._quantize_script import quantize_script @@ -932,144 +934,173 @@ class ObserverTest(QuantizationTestCase): 
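    # In sketch form, the moving-average update the new observers implement
    # (c is the averaging_constant; a paraphrase of the observer code, not a
    # quote):
    #
    #   min_val += c * (batch_min - min_val)
    #   max_val += c * (batch_max - max_val)
    #
    # With c = 0.5 the tracked range lands at the midpoint of the previous
    # range and the new batch extrema, which the test inputs below are chosen
    # to hit exactly.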
@given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), qscheme=st.sampled_from((torch.per_tensor_affine, torch.per_tensor_symmetric)), reduce_range=st.booleans()) - def test_minmax_observer(self, qdtype, qscheme, reduce_range): + def test_per_tensor_observers(self, qdtype, qscheme, reduce_range): # reduce_range cannot be true for symmetric quantization with uint8 if qdtype == torch.quint8 and qscheme == torch.per_tensor_symmetric: reduce_range = False - myobs = MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range) - # Calculate Qparams should return with a warning for observers with no data - qparams = myobs.calculate_qparams() - x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) - y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) - result = myobs(x) - result = myobs(y) - self.assertEqual(result, y) - self.assertEqual(myobs.min_val, 1.0) - self.assertEqual(myobs.max_val, 8.0) - qparams = myobs.calculate_qparams() - if reduce_range: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 * 255 / 127 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 + ObserverList = [MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range), + MovingAverageMinMaxObserver(averaging_constant=0.5, + dtype=qdtype, + qscheme=qscheme, + reduce_range=reduce_range)] + for myobs in ObserverList: + # Calculate Qparams should return with a warning for observers with no data + qparams = myobs.calculate_qparams() + if type(myobs) == MinMaxObserver: + x = torch.tensor([1.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + y = torch.tensor([4.0, 5.0, 5.0, 6.0, 7.0, 8.0]) else: - ref_scale = 0.0313725 * 255 / 127 - ref_zero_point = -64 if qdtype is torch.qint8 else 0 - else: - if qscheme == torch.per_tensor_symmetric: - ref_scale = 0.062745 - ref_zero_point = 0 if qdtype is torch.qint8 else 128 + # Moving average of min/max for x and y matches that of + # extreme values for x/y used for minmax observer + x = torch.tensor([0.0, 2.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + y = torch.tensor([2.0, 5.0, 5.0, 6.0, 7.0, 10.0]) + + result = myobs(x) + result = myobs(y) + self.assertEqual(result, y) + self.assertEqual(myobs.min_val, 1.0) + self.assertEqual(myobs.max_val, 8.0) + qparams = myobs.calculate_qparams() + if reduce_range: + if qscheme == torch.per_tensor_symmetric: + ref_scale = 0.062745 * 255 / 127 + ref_zero_point = 0 if qdtype is torch.qint8 else 128 + else: + ref_scale = 0.0313725 * 255 / 127 + ref_zero_point = -64 if qdtype is torch.qint8 else 0 else: - ref_scale = 0.0313725 - ref_zero_point = -128 if qdtype is torch.qint8 else 0 - self.assertEqual(qparams[1].item(), ref_zero_point) - self.assertAlmostEqual(qparams[0].item(), ref_scale, delta=1e-5) - - # Test for serializability - state_dict = myobs.state_dict() - b = io.BytesIO() - torch.save(state_dict, b) - b.seek(0) - loaded_dict = torch.load(b) - for key in state_dict: - self.assertEqual(state_dict[key], loaded_dict[key]) - loaded_obs = MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range) - loaded_obs.load_state_dict(loaded_dict) - loaded_qparams = loaded_obs.calculate_qparams() - self.assertEqual(myobs.min_val, loaded_obs.min_val) - self.assertEqual(myobs.max_val, loaded_obs.max_val) - self.assertEqual(myobs.calculate_qparams(), loaded_obs.calculate_qparams()) + if qscheme == torch.per_tensor_symmetric: + ref_scale = 0.062745 + ref_zero_point = 0 if qdtype is torch.qint8 else 128 + else: + ref_scale = 0.0313725 + ref_zero_point = -128 if qdtype is torch.qint8 else 0 + self.assertEqual(qparams[1].item(), 
ref_zero_point) + self.assertAlmostEqual(qparams[0].item(), ref_scale, delta=1e-5) + state_dict = myobs.state_dict() + b = io.BytesIO() + torch.save(state_dict, b) + b.seek(0) + loaded_dict = torch.load(b) + for key in state_dict: + self.assertEqual(state_dict[key], loaded_dict[key]) + loaded_obs = MinMaxObserver(dtype=qdtype, qscheme=qscheme, reduce_range=reduce_range) + loaded_obs.load_state_dict(loaded_dict) + loaded_qparams = loaded_obs.calculate_qparams() + self.assertEqual(myobs.min_val, loaded_obs.min_val) + self.assertEqual(myobs.max_val, loaded_obs.max_val) + self.assertEqual(myobs.calculate_qparams(), loaded_obs.calculate_qparams()) @given(qdtype=st.sampled_from((torch.qint8, torch.quint8)), qscheme=st.sampled_from((torch.per_channel_affine, torch.per_channel_symmetric)), ch_axis=st.sampled_from((0, 1, 2, 3)), reduce_range=st.booleans()) - def test_per_channel_minmax_observer(self, qdtype, qscheme, ch_axis, reduce_range): + def test_per_channel_observers(self, qdtype, qscheme, ch_axis, reduce_range): # reduce_range cannot be true for symmetric quantization with uint8 if qdtype == torch.quint8 and qscheme == torch.per_channel_symmetric: reduce_range = False - myobs = PerChannelMinMaxObserver(reduce_range=reduce_range, ch_axis=ch_axis, dtype=qdtype, qscheme=qscheme) - # Calculate qparams should work for empty observers - qparams = myobs.calculate_qparams() - x = torch.tensor( - [ - [[[1.0, 2.0], [2.0, 2.5]], [[3.0, 4.0], [4.5, 6.0]]], - [[[-4.0, -3.0], [5.0, 5.0]], [[6.0, 3.0], [7.0, 8.0]]], - ] - ) - result = myobs(x) - self.assertEqual(result, x) - qparams = myobs.calculate_qparams() - ref_min_vals = [[1.0, -4.0], [-4.0, 3.0], [-4.0, 2.0], [-4.0, -3.0]] - ref_max_vals = [[6.0, 8.0], [5.0, 8.0], [6.0, 8.0], [7.0, 8.0]] - per_channel_symmetric_ref_scales = [ - [0.04705882, 0.06274509], - [0.03921569, 0.0627451], - [0.04705882, 0.0627451], - [0.05490196, 0.0627451], - ] - per_channel_affine_ref_scales = [ - [0.02352941, 0.04705882], - [0.03529412, 0.03137255], - [0.03921569, 0.03137255], - [0.04313726, 0.04313726], - ] - per_channel_affine_qint8_zp = [ - [-128, -43], - [-15, -128], - [-26, -128], - [-35, -58], - ] - per_channel_affine_quint8_zp = [[0, 85], [113, 0], [102, 0], [93, 70]] - - self.assertEqual(myobs.min_vals, ref_min_vals[ch_axis]) - self.assertEqual(myobs.max_vals, ref_max_vals[ch_axis]) - if qscheme == torch.per_channel_symmetric: - ref_scales = per_channel_symmetric_ref_scales[ch_axis] - ref_zero_points = [0, 0] if qdtype is torch.qint8 else [128, 128] - else: - ref_scales = per_channel_affine_ref_scales[ch_axis] - ref_zero_points = ( - per_channel_affine_qint8_zp[ch_axis] - if qdtype is torch.qint8 - else per_channel_affine_quint8_zp[ch_axis] + ObserverList = [PerChannelMinMaxObserver(reduce_range=reduce_range, + ch_axis=ch_axis, + dtype=qdtype, + qscheme=qscheme), + MovingAveragePerChannelMinMaxObserver(averaging_constant=0.5, + reduce_range=reduce_range, + ch_axis=ch_axis, + dtype=qdtype, + qscheme=qscheme)] + + for myobs in ObserverList: + # Calculate qparams should work for empty observers + qparams = myobs.calculate_qparams() + x = torch.tensor( + [ + [[[1.0, 2.0], [2.0, 2.5]], [[3.0, 4.0], [4.5, 6.0]]], + [[[-4.0, -3.0], [5.0, 5.0]], [[6.0, 3.0], [7.0, 8.0]]], + ] ) + if type(myobs) == MovingAveragePerChannelMinMaxObserver: + # Scaling the input tensor to model change in min/max values + # across batches + result = myobs(0.5 * x) + result = myobs(1.5 * x) + self.assertEqual(result, 1.5 * x) + else: + result = myobs(x) + self.assertEqual(result, x) + + qparams 
= myobs.calculate_qparams() + ref_min_vals = [[1.0, -4.0], [-4.0, 3.0], [-4.0, 2.0], [-4.0, -3.0]] + ref_max_vals = [[6.0, 8.0], [5.0, 8.0], [6.0, 8.0], [7.0, 8.0]] + per_channel_symmetric_ref_scales = [ + [0.04705882, 0.06274509], + [0.03921569, 0.0627451], + [0.04705882, 0.0627451], + [0.05490196, 0.0627451], + ] + per_channel_affine_ref_scales = [ + [0.02352941, 0.04705882], + [0.03529412, 0.03137255], + [0.03921569, 0.03137255], + [0.04313726, 0.04313726], + ] + per_channel_affine_qint8_zp = [ + [-128, -43], + [-15, -128], + [-26, -128], + [-35, -58], + ] + per_channel_affine_quint8_zp = [[0, 85], [113, 0], [102, 0], [93, 70]] - if reduce_range: - ref_scales = [s * 255 / 127 for s in ref_scales] - ref_zero_points = [math.floor(z / 2) for z in ref_zero_points] - - self.assertTrue(torch.allclose(qparams[0], torch.tensor(ref_scales, dtype=qparams[0].dtype))) - self.assertTrue(torch.allclose(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype))) - - # Test for serializability - state_dict = myobs.state_dict() - b = io.BytesIO() - torch.save(state_dict, b) - b.seek(0) - loaded_dict = torch.load(b) - for key in state_dict: - self.assertEqual(state_dict[key], loaded_dict[key]) - loaded_obs = PerChannelMinMaxObserver(reduce_range=reduce_range, ch_axis=ch_axis, dtype=qdtype, qscheme=qscheme) - loaded_obs.load_state_dict(loaded_dict) - loaded_qparams = loaded_obs.calculate_qparams() - self.assertEqual(myobs.min_vals, loaded_obs.min_vals) - self.assertEqual(myobs.max_vals, loaded_obs.max_vals) - self.assertEqual(myobs.calculate_qparams(), loaded_obs.calculate_qparams()) + self.assertEqual(myobs.min_vals, ref_min_vals[ch_axis]) + self.assertEqual(myobs.max_vals, ref_max_vals[ch_axis]) + if qscheme == torch.per_channel_symmetric: + ref_scales = per_channel_symmetric_ref_scales[ch_axis] + ref_zero_points = [0, 0] if qdtype is torch.qint8 else [128, 128] + else: + ref_scales = per_channel_affine_ref_scales[ch_axis] + ref_zero_points = ( + per_channel_affine_qint8_zp[ch_axis] + if qdtype is torch.qint8 + else per_channel_affine_quint8_zp[ch_axis] + ) + + if reduce_range: + ref_scales = [s * 255 / 127 for s in ref_scales] + ref_zero_points = [math.floor(z / 2) for z in ref_zero_points] + + self.assertTrue(torch.allclose(qparams[0], torch.tensor(ref_scales, dtype=qparams[0].dtype))) + self.assertTrue(torch.allclose(qparams[1], torch.tensor(ref_zero_points, dtype=qparams[1].dtype))) + + # Test for serializability + state_dict = myobs.state_dict() + b = io.BytesIO() + torch.save(state_dict, b) + b.seek(0) + loaded_dict = torch.load(b) + for key in state_dict: + self.assertEqual(state_dict[key], loaded_dict[key]) + loaded_obs = PerChannelMinMaxObserver(reduce_range=reduce_range, ch_axis=ch_axis, dtype=qdtype, qscheme=qscheme) + loaded_obs.load_state_dict(loaded_dict) + loaded_qparams = loaded_obs.calculate_qparams() + self.assertEqual(myobs.min_vals, loaded_obs.min_vals) + self.assertEqual(myobs.max_vals, loaded_obs.max_vals) + self.assertEqual(myobs.calculate_qparams(), loaded_obs.calculate_qparams()) def test_observer_scriptable(self): - obs = torch.quantization.default_observer() - scripted = torch.jit.script(obs) + obs_list = [MinMaxObserver(), MovingAverageMinMaxObserver()] + for obs in obs_list: + scripted = torch.jit.script(obs) - x = torch.rand(3, 4) - obs(x) - scripted(x) + x = torch.rand(3, 4) + obs(x) + scripted(x) - self.assertEqual(obs.calculate_qparams(), scripted.calculate_qparams()) + self.assertEqual(obs.calculate_qparams(), scripted.calculate_qparams()) - buf = io.BytesIO() - 
torch.jit.save(scripted, buf) - buf.seek(0) - loaded = torch.jit.load(buf) - self.assertEqual(obs.calculate_qparams(), loaded.calculate_qparams()) + buf = io.BytesIO() + torch.jit.save(scripted, buf) + buf.seek(0) + loaded = torch.jit.load(buf) + self.assertEqual(obs.calculate_qparams(), loaded.calculate_qparams()) def test_no_qconfig_propagation(self): model = ModelWithNoQconfigPropagation() diff --git a/torch/quantization/QConfig.py b/torch/quantization/QConfig.py index 0f52e0908ee7d..d1a9230703de7 100644 --- a/torch/quantization/QConfig.py +++ b/torch/quantization/QConfig.py @@ -86,13 +86,13 @@ def get_default_qconfig(backend='fbgemm'): def get_default_qat_qconfig(backend='fbgemm'): # Histogram observer is too slow for quantization aware training if backend == 'fbgemm': - qconfig = QConfig(activation=FakeQuantize.with_args(observer=MinMaxObserver, + qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, reduce_range=True), weight=default_per_channel_weight_fake_quant) elif backend == 'qnnpack': - qconfig = QConfig(activation=FakeQuantize.with_args(observer=MinMaxObserver, + qconfig = QConfig(activation=FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, reduce_range=False), diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 2dc6d12ed8666..7f3924e715ee4 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import torch from torch.nn import Module -from .observer import MinMaxObserver, HistogramObserver, PerChannelMinMaxObserver, _with_args +from .observer import MovingAverageMinMaxObserver, HistogramObserver, MovingAveragePerChannelMinMaxObserver, _with_args class FakeQuantize(Module): ''' Simulate the quantize and dequantize operations in training time. 
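# For orientation, the round trip FakeQuantize simulates on each forward pass,
# in sketch form (scale s and zero_point z come from the attached observer;
# this is a paraphrase, not the module's exact code):
#
#   x_q  = clamp(round(x / s) + z, quant_min, quant_max)   # quantize
#   x_fq = (x_q - z) * s                                   # dequantize
#
# Training sees x_fq, so the forward pass carries quantization error while
# gradients pass through via a straight-through estimator.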
@@ -48,7 +48,7 @@ class FakeQuantize(Module): given the stats `observer_kwargs` ''' - def __init__(self, observer=MinMaxObserver, quant_min=0, quant_max=255, **observer_kwargs): + def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, **observer_kwargs): super(FakeQuantize, self).__init__() assert quant_min <= quant_max, \ 'quant_min must be less than or equal to quant_max' @@ -119,12 +119,12 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, super(FakeQuantize, self)._load_from_state_dict(state_dict, prefix, local_metadata, False, missing_keys, unexpected_keys, error_msgs) -default_fake_quant = FakeQuantize.with_args(observer=MinMaxObserver, quant_min=0, quant_max=255, +default_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, dtype=torch.quint8, qscheme=torch.per_tensor_affine, reduce_range=True) -default_weight_fake_quant = FakeQuantize.with_args(observer=MinMaxObserver, quant_min=-128, quant_max=127, +default_weight_fake_quant = FakeQuantize.with_args(observer=MovingAverageMinMaxObserver, quant_min=-128, quant_max=127, dtype=torch.qint8, qscheme=torch.per_tensor_symmetric, reduce_range=False) -default_per_channel_weight_fake_quant = FakeQuantize.with_args(observer=PerChannelMinMaxObserver, +default_per_channel_weight_fake_quant = FakeQuantize.with_args(observer=MovingAveragePerChannelMinMaxObserver, quant_min=-128, quant_max=127, dtype=torch.qint8, diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 83552d132f6d8..46800dd997438 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -215,7 +215,7 @@ def forward(self, x_orig): max_val = torch.max(torch.max(x), max_val) self.min_val = min_val self.max_val = max_val - return x + return x_orig @torch.jit.export def calculate_qparams(self): @@ -238,6 +238,25 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, super(MinMaxObserver, self)._load_from_state_dict(state_dict, prefix, local_metadata, False, missing_keys, unexpected_keys, error_msgs) +class MovingAverageMinMaxObserver(MinMaxObserver): + def __init__(self, averaging_constant=0.01, **kwargs): + self.averaging_constant = averaging_constant + super(MovingAverageMinMaxObserver, self).__init__(**kwargs) + + def forward(self, x_orig): + x = x_orig.detach() # avoid keeping autograd tape + min_val = self.min_val + max_val = self.max_val + if min_val is None or max_val is None: + min_val = torch.min(x) + max_val = torch.max(x) + else: + min_val = min_val + self.averaging_constant * (torch.min(x) - min_val) + max_val = max_val + self.averaging_constant * (torch.max(x) - max_val) + self.min_val = min_val + self.max_val = max_val + return x_orig + class PerChannelMinMaxObserver(_ObserverBase): r"""Per Channel Observer Module @@ -260,26 +279,26 @@ def __init__(self, ch_axis=0, **kwargs): "Cannot reduce range for symmetric quantization for quint8" ) - def forward(self, x): - with torch.no_grad(): - min_vals = self.min_vals - max_vals = self.max_vals - x_dim = x.size() - - new_axis_list = list(range(len(x_dim))) - new_axis_list[self.ch_axis] = 0 - new_axis_list[0] = self.ch_axis - y = x.permute(tuple(new_axis_list)) - y = torch.flatten(y, start_dim=1) - if min_vals is None or max_vals is None: - min_vals = torch.min(y, 1)[0] - max_vals = torch.max(y, 1)[0] - else: - min_vals = torch.min(torch.min(y, 1)[0], min_vals) - max_vals = torch.max(torch.max(y, 1)[0], max_vals) - self.min_vals = min_vals - 
self.max_vals = max_vals
-        return x
+    def forward(self, x_orig):
+        x = x_orig.detach()  # avoid keeping autograd tape
+        min_vals = self.min_vals
+        max_vals = self.max_vals
+        x_dim = x.size()
+
+        new_axis_list = list(range(len(x_dim)))
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x.permute(tuple(new_axis_list))
+        y = torch.flatten(y, start_dim=1)
+        if min_vals is None or max_vals is None:
+            min_vals = torch.min(y, 1)[0]
+            max_vals = torch.max(y, 1)[0]
+        else:
+            min_vals = torch.min(torch.min(y, 1)[0], min_vals)
+            max_vals = torch.max(torch.max(y, 1)[0], max_vals)
+        self.min_vals = min_vals
+        self.max_vals = max_vals
+        return x_orig
 
     def calculate_qparams(self):
         return self._calculate_per_channel_qparams(self.min_vals, self.max_vals)
@@ -296,6 +315,38 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
         super(PerChannelMinMaxObserver, self)._load_from_state_dict(state_dict, prefix, local_metadata, False,
                                                                     missing_keys, unexpected_keys, error_msgs)
 
+class MovingAveragePerChannelMinMaxObserver(PerChannelMinMaxObserver):
+    r"""Per Channel Observer Module
+    The module will record the running average of max and min value for each
+    channel of the observed Tensor and calculate_qparams will calculate
+    scales and zero_points for each channel
+    """
+
+    def __init__(self, averaging_constant=0.01, **kwargs):
+        self.averaging_constant = averaging_constant
+        super(MovingAveragePerChannelMinMaxObserver, self).__init__(**kwargs)
+
+    def forward(self, x_orig):
+        x = x_orig.detach()  # avoid keeping autograd tape
+        min_vals = self.min_vals
+        max_vals = self.max_vals
+        x_dim = x.size()
+
+        new_axis_list = list(range(len(x_dim)))
+        new_axis_list[self.ch_axis] = 0
+        new_axis_list[0] = self.ch_axis
+        y = x.permute(tuple(new_axis_list))
+        y = torch.flatten(y, start_dim=1)
+        if min_vals is None or max_vals is None:
+            min_vals = torch.min(y, 1)[0]
+            max_vals = torch.max(y, 1)[0]
+        else:
+            min_vals = min_vals + self.averaging_constant * (torch.min(y, 1)[0] - min_vals)
+            max_vals = max_vals + self.averaging_constant * (torch.max(y, 1)[0] - max_vals)
+        self.min_vals = min_vals
+        self.max_vals = max_vals
+        return x_orig
+
 class HistogramObserver(_ObserverBase):
     r"""
     The module records the running histogram of tensor values along with

From afbbe16f495b205b098a7dd570677d14d014e8f7 Mon Sep 17 00:00:00 2001
From: Ivan Kobzarev
Date: Fri, 4 Oct 2019 16:31:51 -0700
Subject: [PATCH 076/113] Add methods to write image tensor content to buffer (#27359)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27359

Adding methods to TensorImageUtils:
```
bitmapToFloatBuffer(..., FloatBuffer outBuffer, int outBufferOffset)
imageYUV420CenterCropToFloat32Tensor(..., FloatBuffer outBuffer, int outBufferOffset)
```

To be able to
- reuse FloatBuffer for inference
- create batch-Tensor (containing several images/bitmaps)

As we reuse FloatBuffer for the example demo app (image classification), the profiler shows fewer memory allocations (previously every run created a new input tensor with a newly allocated FloatBuffer) and ~-20ms on my PixelXL.

Known open question:
At the moment every tensor element is written separately by calling `outBuffer.put()`, which is a native call crossing language boundaries.
As an alternative - allocate a `float[]` on the java side, fill it, and put it into `outBuffer` with one call, reducing native calls but increasing memory allocation on the java side.
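For illustration, a minimal reuse sketch built on the new methods (signatures as in the diff below; driving a `Module` with the resulting tensor is assumed usage, not part of this patch):
```
// Allocate the buffer and tensor once; refill the buffer on every frame.
FloatBuffer inputBuffer = Tensor.allocateFloatBuffer(3 * tensorWidth * tensorHeight);
Tensor inputTensor = Tensor.newFloat32Tensor(
    new long[]{1, 3, tensorHeight, tensorWidth}, inputBuffer);

// Per frame: overwrite the buffer contents in place instead of allocating.
TensorImageUtils.imageYUV420CenterCropToFloatBuffer(
    image, rotateCWDegrees, tensorWidth, tensorHeight,
    normMeanRGB, normStdRGB, inputBuffer, /* outBufferOffset */ 0);
```
A batch tensor follows the same pattern: image `i` is written at offset `i * 3 * tensorWidth * tensorHeight` into a buffer sized for `N` images.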
Tested locally just eyeballing durations - have not noticed big difference - decided to go with less memory allocations. Will be good to merge into 1.3.0, but if not - demo app can use snapshot dependencies with this change. PR with integration to demo app: https://github.com/pytorch/android-demo-app/pull/6 Test Plan: Imported from OSS Differential Revision: D17758621 Pulled By: IvanKobzarev fbshipit-source-id: b4f1a068789279002d7ecc0bc680111f781bf980 --- .../pytorch/torchvision/TensorImageUtils.java | 130 +++++++++++++++--- 1 file changed, 109 insertions(+), 21 deletions(-) diff --git a/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java b/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java index 5f512e8193852..4c4258b4a4784 100644 --- a/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java +++ b/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java @@ -7,6 +7,7 @@ import org.pytorch.Tensor; import java.nio.ByteBuffer; +import java.nio.FloatBuffer; import java.util.Locale; /** @@ -26,7 +27,7 @@ public final class TensorImageUtils { * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB order */ public static Tensor bitmapToFloat32Tensor( - final Bitmap bitmap, float[] normMeanRGB, float normStdRGB[]) { + final Bitmap bitmap, final float[] normMeanRGB, final float normStdRGB[]) { checkNormMeanArg(normMeanRGB); checkNormStdArg(normStdRGB); @@ -35,8 +36,9 @@ public static Tensor bitmapToFloat32Tensor( } /** - * Creates new {@link org.pytorch.Tensor} from specified area of {@link android.graphics.Bitmap}, - * normalized with specified in parameters mean and std. + * Writes tensor content from specified {@link android.graphics.Bitmap}, + * normalized with specified in parameters mean and std to specified {@link java.nio.FloatBuffer} + * with specified offset. 
* * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data * @param x - x coordinate of top left corner of bitmap's area @@ -46,21 +48,23 @@ public static Tensor bitmapToFloat32Tensor( * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB order */ - public static Tensor bitmapToFloat32Tensor( + public static void bitmapToFloatBuffer( final Bitmap bitmap, - int x, - int y, - int width, - int height, - float[] normMeanRGB, - float[] normStdRGB) { + final int x, + final int y, + final int width, + final int height, + final float[] normMeanRGB, + final float[] normStdRGB, + final FloatBuffer outBuffer, + final int outBufferOffset) { + checkOutBufferCapacity(outBuffer, outBufferOffset, width, height); checkNormMeanArg(normMeanRGB); checkNormStdArg(normStdRGB); final int pixelsCount = height * width; final int[] pixels = new int[pixelsCount]; bitmap.getPixels(pixels, 0, width, x, y, width, height); - final float[] floatArray = new float[3 * pixelsCount]; final int offset_g = pixelsCount; final int offset_b = 2 * pixelsCount; for (int i = 0; i < pixelsCount; i++) { @@ -68,11 +72,41 @@ public static Tensor bitmapToFloat32Tensor( float r = ((c >> 16) & 0xff) / 255.0f; float g = ((c >> 8) & 0xff) / 255.0f; float b = ((c) & 0xff) / 255.0f; - floatArray[i] = (r - normMeanRGB[0]) / normStdRGB[0]; - floatArray[offset_g + i] = (g - normMeanRGB[1]) / normStdRGB[1]; - floatArray[offset_b + i] = (b - normMeanRGB[2]) / normStdRGB[2]; + float rF = (r - normMeanRGB[0]) / normStdRGB[0]; + float gF = (g - normMeanRGB[1]) / normStdRGB[1]; + float bF = (b - normMeanRGB[2]) / normStdRGB[2]; + outBuffer.put(outBufferOffset + i, rF); + outBuffer.put(outBufferOffset + offset_g + i, gF); + outBuffer.put(outBufferOffset + offset_b + i, bF); } - return Tensor.newFloat32Tensor(new long[]{1, 3, height, width}, floatArray); + } + + /** + * Creates new {@link org.pytorch.Tensor} from specified area of {@link android.graphics.Bitmap}, + * normalized with specified in parameters mean and std. 
+ * + * @param bitmap {@link android.graphics.Bitmap} as a source for Tensor data + * @param x - x coordinate of top left corner of bitmap's area + * @param y - y coordinate of top left corner of bitmap's area + * @param width - width of bitmap's area + * @param height - height of bitmap's area + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB order + */ + public static Tensor bitmapToFloat32Tensor( + final Bitmap bitmap, + int x, + int y, + int width, + int height, + float[] normMeanRGB, + float[] normStdRGB) { + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + + final FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(3 * width * height); + bitmapToFloatBuffer(bitmap, x, y, width, height, normMeanRGB, normStdRGB, floatBuffer, 0); + return Tensor.newFloat32Tensor(new long[]{1, 3, height, width}, floatBuffer); } /** @@ -105,6 +139,52 @@ public static Tensor imageYUV420CenterCropToFloat32Tensor( checkRotateCWDegrees(rotateCWDegrees); checkTensorSize(tensorWidth, tensorHeight); + final FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(3 * tensorWidth * tensorHeight); + imageYUV420CenterCropToFloatBuffer( + image, + rotateCWDegrees, + tensorWidth, + tensorHeight, + normMeanRGB, normStdRGB, floatBuffer, 0); + return Tensor.newFloat32Tensor(new long[]{1, 3, tensorHeight, tensorWidth}, floatBuffer); + } + + /** + * Writes tensor content from specified {@link android.media.Image}, doing optional rotation, + * scaling (nearest) and center cropping to specified {@link java.nio.FloatBuffer} with specified offset. + * + * @param image {@link android.media.Image} as a source for Tensor data + * @param rotateCWDegrees Clockwise angle through which the input image needs to be rotated to be + * upright. 
Range of valid values: 0, 90, 180, 270 + * @param tensorWidth return tensor width, must be positive + * @param tensorHeight return tensor height, must be positive + * @param normMeanRGB means for RGB channels normalization, length must equal 3, RGB order + * @param normStdRGB standard deviation for RGB channels normalization, length must equal 3, RGB order + * @param outBuffer Output buffer, where tensor content will be written + * @param outBufferOffset Output buffer offset with which tensor content will be written + */ + public static void imageYUV420CenterCropToFloatBuffer( + final Image image, + int rotateCWDegrees, + final int tensorWidth, + final int tensorHeight, + float[] normMeanRGB, + float[] normStdRGB, + final FloatBuffer outBuffer, + final int outBufferOffset) { + checkOutBufferCapacity(outBuffer, outBufferOffset, tensorWidth, tensorHeight); + + if (image.getFormat() != ImageFormat.YUV_420_888) { + throw new IllegalArgumentException( + String.format( + Locale.US, "Image format %d != ImageFormat.YUV_420_888", image.getFormat())); + } + + checkNormMeanArg(normMeanRGB); + checkNormStdArg(normStdRGB); + checkRotateCWDegrees(rotateCWDegrees); + checkTensorSize(tensorWidth, tensorHeight); + final int widthBeforeRotation = image.getWidth(); final int heightBeforeRotation = image.getHeight(); @@ -158,7 +238,6 @@ public static Tensor imageYUV420CenterCropToFloat32Tensor( final int channelSize = tensorHeight * tensorWidth; final int tensorInputOffsetG = channelSize; final int tensorInputOffsetB = 2 * channelSize; - final float[] floatArray = new float[3 * channelSize]; for (int x = 0; x < tensorWidth; x++) { for (int y = 0; y < tensorHeight; y++) { @@ -198,13 +277,22 @@ public static Tensor imageYUV420CenterCropToFloat32Tensor( int r = clamp((a0 + a1) >> 10, 0, 255); int g = clamp((a0 - a2 - a3) >> 10, 0, 255); int b = clamp((a0 + a4) >> 10, 0, 255); - final int offset = y * tensorWidth + x; - floatArray[offset] = ((r / 255.f) - normMeanRGB[0]) / normStdRGB[0]; - floatArray[tensorInputOffsetG + offset] = ((g / 255.f) - normMeanRGB[1]) / normStdRGB[1]; - floatArray[tensorInputOffsetB + offset] = ((b / 255.f) - normMeanRGB[2]) / normStdRGB[2]; + final int offset = outBufferOffset + y * tensorWidth + x; + float rF = ((r / 255.f) - normMeanRGB[0]) / normStdRGB[0]; + float gF = ((g / 255.f) - normMeanRGB[1]) / normStdRGB[1]; + float bF = ((b / 255.f) - normMeanRGB[2]) / normStdRGB[2]; + + outBuffer.put(offset, rF); + outBuffer.put(offset + tensorInputOffsetG, gF); + outBuffer.put(offset + tensorInputOffsetB, bF); } } - return Tensor.newFloat32Tensor(new long[]{1, 3, tensorHeight, tensorWidth}, floatArray); + } + + private static void checkOutBufferCapacity(FloatBuffer outBuffer, int outBufferOffset, int tensorWidth, int tensorHeight) { + if (outBufferOffset + 3 * tensorWidth * tensorHeight > outBuffer.capacity()) { + throw new IllegalStateException("Buffer underflow"); + } } private static void checkTensorSize(int tensorWidth, int tensorHeight) { From 381cf2bd247fc6f92c1adf6821db888c76e2a62f Mon Sep 17 00:00:00 2001 From: Amy Yang Date: Fri, 4 Oct 2019 17:01:53 -0700 Subject: [PATCH 077/113] add warning to dnnlowp fc if quantization kind is not min_max Summary: Print warning when using DNNLOWP dynamic int8 quant for FC and activation_quantization_kind != min_max. Warning will display in console but not in Bento. Would have to use CAFFE_ENFORCE to alert in Bento. Test Plan: buck run unit test forcing DNNLOWP FC with activation_quantization_kind = "l2" and saw warning printed in console. 
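For reference, the min_max scheme the warning points to derives quantization parameters from the observed range roughly as follows (a simplified sketch; the helper name is hypothetical and the real ChooseQuantizationParams does additional range and precision handling):
```
#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine uint8 quantization parameters from an observed [min, max] range.
inline void choose_qparams_minmax(
    float min, float max, float* scale, int32_t* zero_point) {
  float min_v = std::min(min, 0.f);  // representable range must cover zero
  float max_v = std::max(max, 0.f);
  *scale = (max_v - min_v) / 255.f;
  *zero_point = *scale > 0.f
      ? static_cast<int32_t>(std::round(-min_v / *scale))
      : 0;
}
```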
Reviewed By: csummersea

Differential Revision: D17770921

fbshipit-source-id: b6532e4c9a86d74e3db4cb432735505d378a366e
---
 .../quantization/server/caffe2_dnnlowp_utils.cc   | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/caffe2/quantization/server/caffe2_dnnlowp_utils.cc b/caffe2/quantization/server/caffe2_dnnlowp_utils.cc
index 4e0e4cecbd40b..af671a0f596eb 100644
--- a/caffe2/quantization/server/caffe2_dnnlowp_utils.cc
+++ b/caffe2/quantization/server/caffe2_dnnlowp_utils.cc
@@ -71,7 +71,20 @@ TensorQuantizationParams GetInputTensorQuantizationParamsOf(
     float min, max;
     fbgemm::FindMinMax(
         tensor->template data<float>(), &min, &max, tensor->numel());
-
+    auto activation_quantization_kind = qfactory->GetActivationKind();
+    if (activation_quantization_kind !=
+        QuantizationFactory::QuantizationKind::MIN_MAX_QUANTIZATION) {
+      LOG(WARNING)
+          << "DNNLOWP dynamic int8 FC uses min_max as the only activation_quantization kind. Qparams will be assigned based on min_max regardless of activation_quantization_kind args.";
+    }
+    if (is_weight) {
+      auto weight_quantization_kind = qfactory->GetWeightKind();
+      if (weight_quantization_kind !=
+          QuantizationFactory::QuantizationKind::MIN_MAX_QUANTIZATION) {
+        LOG(WARNING)
+            << "DNNLOWP dynamic int8 FC weight is not constant, assigning qparams to weight based on min_max, regardless of weight_quantization_kind args.";
+      }
+    }
     return qfactory->ChooseQuantizationParams(min, max, is_weight);
   }
 }

From b05ec828ad419ba9a4b09b1640a5c989266cd598 Mon Sep 17 00:00:00 2001
From: Wanchao Liang
Date: Fri, 4 Oct 2019 17:10:38 -0700
Subject: [PATCH 078/113] Add interface/object serialization as module attribute (#26770)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/26770

This PR added the interface/object serialization as a module attribute, to allow initializing an object as an interface type during python initialization.
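A minimal sketch of the pattern this enables, in the spirit of the test below (class names and bodies are illustrative, not taken from the patch):
```
import torch
import torch.nn as nn
from torch import Tensor

@torch.jit.interface
class OneTwo(object):
    def two(self, x):
        # type: (Tensor) -> Tensor
        pass

@torch.jit.script
class Foo(object):
    def __init__(self):
        pass

    def two(self, x):
        # type: (Tensor) -> Tensor
        return 2 * x

class Holder(nn.Module):
    def __init__(self):
        super(Holder, self).__init__()
        self.proxy_mod = Foo()  # concrete object behind the interface

    def forward(self, x):
        return self.proxy_mod.two(x)

Holder.__annotations__ = {'proxy_mod': OneTwo}
scripted = torch.jit.script(Holder())  # proxy_mod saved as an interface-typed attribute
```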
Because interface type can be backed by any class object that implements that interface, if we declare it in python/module.__init__, we will need to collect the run time types of the value and serialize them to ensure complete code information Test Plan: Imported from OSS Differential Revision: D17742707 fbshipit-source-id: 7f614ad4f982996d320a0e2dd3515bf47370e730 --- test/test_jit.py | 56 +++++++++++++++++++------------------- torch/csrc/jit/export.cpp | 16 ++++++++--- torch/csrc/jit/pickler.cpp | 6 ++++ torch/csrc/jit/pickler.h | 11 ++++++-- 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 9b9133741ea56..6da43694aca11 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -19720,43 +19720,43 @@ def wrong4(x): return as_interface(x) # Test interface/class python assignment - with torch.jit._disable_emit_hooks(): - class TestPyAssign(nn.Module): - def __init__(self): - super(TestPyAssign, self).__init__() - self.proxy_mod = Foo() + class TestPyAssign(nn.Module): + def __init__(self): + super(TestPyAssign, self).__init__() + self.proxy_mod = Foo() - def forward(self, x): - return self.proxy_mod.two(x) + def forward(self, x): + return self.proxy_mod.two(x) - TestPyAssign.__annotations__ = {'proxy_mod': OneTwo} + TestPyAssign.__annotations__ = {'proxy_mod': OneTwo} - input = torch.rand(3, 4) - scripted_pyassign_mod = torch.jit.script(TestPyAssign()) - self.assertEqual(scripted_pyassign_mod(input), 2 * input) + input = torch.rand(3, 4) + scripted_pyassign_mod = torch.jit.script(TestPyAssign()) + imported_mod = self.getExportImportCopy(scripted_pyassign_mod) + self.assertEqual(scripted_pyassign_mod(input), imported_mod(input)) - class TestPyAssignError(nn.Module): - def __init__(self, obj): - super(TestPyAssignError, self).__init__() - self.proxy_mod = obj + class TestPyAssignError(nn.Module): + def __init__(self, obj): + super(TestPyAssignError, self).__init__() + self.proxy_mod = obj - def forward(self, x): - return self.proxy_mod.two(x) + def forward(self, x): + return self.proxy_mod.two(x) - TestPyAssignError.__annotations__ = {'proxy_mod': OneTwoThree} + TestPyAssignError.__annotations__ = {'proxy_mod': OneTwoThree} - with self.assertRaisesRegex(RuntimeError, - "is not compatible with interface __torch__"): - torch.jit.script(TestPyAssignError(Foo())) + with self.assertRaisesRegex(RuntimeError, + "is not compatible with interface __torch__"): + torch.jit.script(TestPyAssignError(Foo())) - # test pure python object assignment to interface fails - class PyClass(object): - def __init__(self): - pass + # test pure python object assignment to interface fails + class PyClass(object): + def __init__(self): + pass - with self.assertRaisesRegex(RuntimeError, - "the value is not a TorchScript compatible type"): - torch.jit.script(TestPyAssignError(PyClass())) + with self.assertRaisesRegex(RuntimeError, + "the value is not a TorchScript compatible type"): + torch.jit.script(TestPyAssignError(PyClass())) # TODO test: interface-interface class-interface inheritance errors, # NamedTuple inheritance errors diff --git a/torch/csrc/jit/export.cpp b/torch/csrc/jit/export.cpp index 1757c42b06a41..f8d1daed1d6da 100644 --- a/torch/csrc/jit/export.cpp +++ b/torch/csrc/jit/export.cpp @@ -543,7 +543,9 @@ class ScriptModuleSerializer { bool bytecode_format) { C10_LOG_API_USAGE_ONCE("torch.script.save"); writeExtraFiles(module, extra_files); - // Serialize all code info. 
+    // Serialize the model object
+    writeArchive("data", module.module_object());
+    // Then we serialize all code info.
+    writeCode(module.type());
     // The tensor constants from the code are written to a separate archive
     // so loading the code does not depend on loading the data
@@ -553,18 +555,19 @@ class ScriptModuleSerializer {
     if (bytecode_format) {
       writeByteCode(module);
     }
-    // finally we serialize the model
-    writeArchive("data", module.module_object());
   }
 
  private:
   void writeArchive(const std::string& archive_name, const IValue& value) {
     std::vector<char> data;
+    // Vector to capture the run-time class types during pickling the IValues
+    std::vector<c10::ClassTypePtr> memorizedClassTypes;
     Pickler data_pickle(
         [&](const char* buf, size_t size) {
           data.insert(data.end(), buf, buf + size);
         },
-        nullptr);
+        nullptr,
+        &memorizedClassTypes);
     data_pickle.protocol();
     data_pickle.pushIValue(value);
     data_pickle.stop();
@@ -577,6 +580,11 @@ class ScriptModuleSerializer {
     std::stringstream fname;
     fname << archive_name << ".pkl";
     writer_.writeRecord(fname.str(), data.data(), data.size());
+
+    // serialize all the captured run-time class types
+    for (const c10::ClassTypePtr& wroteType : memorizedClassTypes) {
+      convertNamedType(wroteType);
+    }
   }
 
   void writeExtraFiles(
diff --git a/torch/csrc/jit/pickler.cpp b/torch/csrc/jit/pickler.cpp
index 462120e5c4d4f..e70e8f14ec25a 100644
--- a/torch/csrc/jit/pickler.cpp
+++ b/torch/csrc/jit/pickler.cpp
@@ -105,6 +105,12 @@ void Pickler::pushIValueImpl(const IValue& ivalue) {
   } else if (ivalue.isObject()) {
     auto obj = ivalue.toObject();
     auto type = obj->type();
+    if (memorized_class_types_ != nullptr) {
+      // Memorize every class type the Pickler encountered
+      // This is used to make sure we capture all the run-time types
+      // and serialize them properly for class/interface polymorphism
+      memorized_class_types_->emplace_back(type);
+    }
     pushGlobal(type->name()->prefix(), type->name()->name());
     push(PickleOpCode::EMPTY_TUPLE);
     push(PickleOpCode::NEWOBJ);
diff --git a/torch/csrc/jit/pickler.h b/torch/csrc/jit/pickler.h
index 1fa522a12c25c..1477bc51b5333 100644
--- a/torch/csrc/jit/pickler.h
+++ b/torch/csrc/jit/pickler.h
@@ -124,8 +124,11 @@ class Pickler {
  public:
   Pickler(
       std::function<void(const char*, size_t)> writer,
-      std::vector<at::Tensor>* tensor_table)
-    : writer_(writer), tensor_table_(tensor_table) {}
+      std::vector<at::Tensor>* tensor_table,
+      std::vector<c10::ClassTypePtr>* memorized_class_types = nullptr)
+    : writer_(writer),
+      tensor_table_(tensor_table),
+      memorized_class_types_(memorized_class_types) {}
 
   // Push protocol onto the stack
   void protocol();
@@ -141,6 +144,7 @@
   const std::vector<at::Tensor>& tensorData() {
     return tensor_data_;
   }
+
   void pushEmptyDict();
   void pushDict(const IValue& ivalue);
   void pushInt(int64_t value);
@@ -213,6 +217,9 @@
   // object, and we will alias it to the old object at that address.
   std::vector<IValue> memoized_ivalues_;
 
+  // List of all the types that it wrote, inspected from the IValues it wrote.
+  std::vector<c10::ClassTypePtr>* memorized_class_types_;
+
   // List of tensor storages to serialize in the same binary as the pickle data
   // similar to ivalues, they are memoized using BINPUT
   std::vector<at::Tensor> tensor_data_;

From 7b3881f68cb01dd0b98fb3ff5ef9618ac286a5ef Mon Sep 17 00:00:00 2001
From: Zafar Takhirov
Date: Fri, 4 Oct 2019 17:18:16 -0700
Subject: [PATCH 079/113] Adding docstrings for nnq.functional

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27363

Test Plan: Imported from OSS

Differential Revision: D17758907

Pulled By: zafartahirov

fbshipit-source-id: f560f2726cf51ceebdbf22ebef2d067422340cf2
---
 torch/nn/quantized/functional.py | 280 +++++++++++++++++++++++++------
 1 file changed, 232 insertions(+), 48 deletions(-)

diff --git a/torch/nn/quantized/functional.py b/torch/nn/quantized/functional.py
index 79c9d0c6ff9cb..462c70776fcd7 100644
--- a/torch/nn/quantized/functional.py
+++ b/torch/nn/quantized/functional.py
@@ -8,54 +8,58 @@
 from torch._jit_internal import List as _List
 from torch.nn.modules.utils import _pair
 
+# Although some of the functions and docstrings are mirrored from the torch.nn,
+# we want to have them here for future changes.
 
-def relu(input, inplace=False):
-    # type: (Tensor, bool) -> Tensor
-    r"""relu(input, inplace=False) -> Tensor
+def avg_pool2d(input, kernel_size, stride=None, padding=0, ceil_mode=False,
+               count_include_pad=True, divisor_override=None):
+    r"""
+    Applies 2D average-pooling operation in :math:`kH \times kW` regions by step size
+    :math:`sH \times sW` steps. The number of output features is equal to the number of
+    input planes.
 
-    Applies the rectified linear unit function element-wise. See
-    :class:`~torch.nn.ReLU` for more details.
+    .. note:: The input quantization parameters propagate to the output.
+
+    See :class:`~torch.nn.quantized.AvgPool2d` for details and output shape.
+
+    Args:
+        input: quantized input tensor :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
+        kernel_size: size of the pooling region. Can be a single number or a
+          tuple `(kH, kW)`
+        stride: stride of the pooling operation. Can be a single number or a
+          tuple `(sH, sW)`. Default: :attr:`kernel_size`
+        padding: implicit zero paddings on both sides of the input. Can be a
+          single number or a tuple `(padH, padW)`. Default: 0
+        ceil_mode: when True, will use `ceil` instead of `floor` in the formula
+            to compute the output shape. Default: ``False``
+        count_include_pad: when True, will include the zero-padding in the
+            averaging calculation. Default: ``True``
+        divisor_override: if specified, it will be used as divisor, otherwise
+            size of the pooling region will be used. Default: None
    """
    if not input.is_quantized:
-        raise ValueError("Input to 'quantized.relu' must be quantized!")
-    if inplace:
-        return torch.relu_(input)
-    else:
-        return torch.relu(input)
+        raise ValueError("Input to 'quantized.avg_pool2d' must be quantized!")
+    return torch.nn.functional.avg_pool2d(input, kernel_size, stride, padding,
+                                          ceil_mode, count_include_pad,
+                                          divisor_override)
 
-def linear(input, weight, bias=None, scale=None, zero_point=None):
-    # type: (Tensor, Tensor, Optional[Tensor]) -> Tensor
+def adaptive_avg_pool2d(input, output_size):
+    # type: (Tensor, BroadcastingList2[int]) -> Tensor
     r"""
-    Applies a linear transformation to the incoming quantized data:
-    :math:`y = xA^T + b`.
-    See :class:`~torch.nn.Linear`
+    Applies a 2D adaptive average pooling over a quantized input signal composed
+    of several quantized input planes.
 
-    .. note::
+    .. 
note:: The input quantization parameters propagate to the output.
 
-    Current implementation uses packed weights. This has penalty on performance.
-    If you want to avoid the overhead, use :class:`~torch.nn.quantized.Linear`.
+    See :class:`~torch.nn.quantized.AdaptiveAvgPool2d` for details and output shape.
 
     Args:
-        input (Tensor): Quantized input of type `torch.quint8`
-        weight (Tensor): Quantized weight of type `torch.qint8`
-        bias (Tensor): None or fp32 bias of type `torch.float`
-        scale (double): output scale. If None, derived from the input scale
-        zero_point (long): output zero point. If None, derived from the input zero_point
-
-    Shape:
-        - Input: :math:`(N, *, in\_features)` where `*` means any number of
-          additional dimensions
-        - Weight: :math:`(out\_features, in\_features)`
-        - Bias: :math:`(out\_features)`
-        - Output: :math:`(N, *, out\_features)`
+        output_size: the target output size (single integer or
+                     double-integer tuple)
     """
-    if scale is None:
-        scale = input.q_scale()
-    if zero_point is None:
-        zero_point = input.q_zero_point()
-    _packed_params = torch.ops.quantized.linear_prepack(weight, bias)
-    return torch.ops.quantized.linear(input, _packed_params, scale,
-                                      zero_point)
+    if not input.is_quantized:
+        raise ValueError("Input to 'quantized.adaptive_avg_pool2d' must be quantized!")
+    return torch.nn.functional.adaptive_avg_pool2d(input, output_size)
 
 def conv2d(input, weight, bias,
@@ -63,16 +67,10 @@ def conv2d(input, weight, bias,
            scale=1.0, zero_point=0,
            dtype=torch.quint8):
     r"""
-    conv2d(input, weight, bias,
-           stride=1, padding=0, dilation=1, groups=1,
-           padding_mode='zeros',
-           scale=1.0, zero_point=0,
-           dtype=torch.quint8) -> Tensor
-
     Applies a 2D convolution over a quantized 2D input composed of several input
     planes.
 
-    See :class:`~torch.nn.Conv2d` for details and output shape.
+    See :class:`~torch.nn.quantized.Conv2d` for details and output shape.
 
     Args:
         input: quantized input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
@@ -120,11 +118,87 @@ def conv2d(input, weight, bias,
                                       stride, padding, dilation,
                                       groups, scale, zero_point)
 
+def interpolate(input, size=None, scale_factor=None, mode='nearest', align_corners=None):
+    r"""Down/up samples the input to either the given :attr:`size` or the given
+    :attr:`scale_factor`
+
+    See :func:`torch.nn.functional.interpolate` for implementation details.
+
+    The input dimensions are interpreted in the form:
+    `mini-batch x channels x [optional depth] x [optional height] x width`.
+
+    .. note:: The input quantization parameters propagate to the output.
+
+    .. note:: Only 2D input is supported for quantized inputs
+
+    .. note:: Only the following modes are supported for the quantized inputs:
+
+        - `bilinear`
+        - `nearest`
+
+    Args:
+        input (Tensor): the input tensor
+        size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]):
+            output spatial size.
+        scale_factor (float or Tuple[float]): multiplier for spatial size. Has to match input size if it is a tuple.
+        mode (str): algorithm used for upsampling:
+            ``'nearest'`` | ``'bilinear'``
+        align_corners (bool, optional): Geometrically, we consider the pixels of the
+            input and output as squares rather than points.
+            If set to ``True``, the input and output tensors are aligned by the
+            center points of their corner pixels, preserving the values at the corner pixels.
+ If set to ``False``, the input and output tensors are aligned by the corner + points of their corner pixels, and the interpolation uses edge value padding + for out-of-boundary values, making this operation *independent* of input size + when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode` + is ``'bilinear'``. + Default: ``False`` + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.interpolate' must be quantized!") + return torch.nn.functional.interpolate(input, size, scale_factor, mode, + align_corners) + +def linear(input, weight, bias=None, scale=None, zero_point=None): + # type: (Tensor, Tensor, Optional[Tensor]) -> Tensor + r""" + Applies a linear transformation to the incoming quantized data: + :math:`y = xA^T + b`. + See :class:`~torch.nn.quantized.Linear` + + .. note:: + + Current implementation packs weights on every call, which has penalty on performance. + If you want to avoid the overhead, use :class:`~torch.nn.quantized.Linear`. + + Args: + input (Tensor): Quantized input of type `torch.quint8` + weight (Tensor): Quantized weight of type `torch.qint8` + bias (Tensor): None or fp32 bias of type `torch.float` + scale (double): output scale. If None, derived from the input scale + zero_point (long): output zero point. If None, derived from the input zero_point + + Shape: + - Input: :math:`(N, *, in\_features)` where `*` means any number of + additional dimensions + - Weight: :math:`(out\_features, in\_features)` + - Bias: :math:`(out\_features)` + - Output: :math:`(N, *, out\_features)` + """ + if scale is None: + scale = input.q_scale() + if zero_point is None: + zero_point = input.q_zero_point() + _packed_params = torch.ops.quantized.linear_prepack(weight, bias) + return torch.ops.quantized.linear(input, _packed_params, scale, zero_point) + def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False, return_indices=False): r"""Applies a 2D max pooling over a quantized input signal composed of several quantized input planes. + .. note:: The input quantization parameters are propagated to the output. + See :class:`~torch.nn.quantized.MaxPool2d` for details. """ if return_indices: @@ -134,7 +208,117 @@ def max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1, return torch.nn.functional.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode, return_indices) -# TODO(zaf): Add documentation -adaptive_avg_pool2d = torch.nn.functional.adaptive_avg_pool2d -interpolate = torch.nn.functional.interpolate -avg_pool2d = torch.nn.functional.avg_pool2d +def relu(input, inplace=False): + # type: (Tensor, bool) -> Tensor + r"""relu(input, inplace=False) -> Tensor + + Applies the rectified linear unit function element-wise. + See :class:`~torch.nn.quantized.ReLU` for more details. + + Args: + input: quantized input + inplace: perform the computation inplace + """ + if not input.is_quantized: + raise ValueError("Input to 'quantized.relu' must be quantized!") + if inplace: + return torch.relu_(input) + else: + return torch.relu(input) + +def upsample(input, size=None, scale_factor=None, mode='nearest', align_corners=None): + r"""Upsamples the input to either the given :attr:`size` or the given + :attr:`scale_factor` + + .. warning:: + This function is deprecated in favor of + :func:`torch.nn.quantized.functional.interpolate`. + This is equivalent with ``nn.quantized.functional.interpolate(...)``. + + See :func:`torch.nn.functional.interpolate` for implementation details. 
+ + The input dimensions are interpreted in the form: + `mini-batch x channels x [optional depth] x [optional height] x width`. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D input is supported for quantized inputs + + .. note:: Only the following modes are supported for the quantized inputs: + + - `bilinear` + - `nearest` + + Args: + input (Tensor): quantized input tensor + size (int or Tuple[int] or Tuple[int, int] or Tuple[int, int, int]): + output spatial size. + scale_factor (float or Tuple[float]): multiplier for spatial size. Has to be an integer. + mode (string): algorithm used for upsampling: + ``'nearest'`` | ``'bilinear'`` + align_corners (bool, optional): Geometrically, we consider the pixels of the + input and output as squares rather than points. + If set to ``True``, the input and output tensors are aligned by the + center points of their corner pixels, preserving the values at the corner pixels. + If set to ``False``, the input and output tensors are aligned by the corner + points of their corner pixels, and the interpolation uses edge value padding + for out-of-boundary values, making this operation *independent* of input size + when :attr:`scale_factor` is kept the same. This only has an effect when :attr:`mode` + is ``'bilinear'``. + Default: ``False`` + + .. warning:: + With ``align_corners = True``, the linearly interpolating modes + (`bilinear`) don't proportionally align the + output and input pixels, and thus the output values can depend on the + input size. This was the default behavior for these modes up to version + 0.3.1. Since then, the default behavior is ``align_corners = False``. + See :class:`~torch.nn.Upsample` for concrete examples on how this + affects the outputs. + """ + warnings.warn("nn.quantized.functional.upsample is deprecated. Use nn.quantized.functional.interpolate instead.") + return interpolate(input, size, scale_factor, mode, align_corners) + +def upsample_bilinear(input, size=None, scale_factor=None): + r"""Upsamples the input, using bilinear upsampling. + + .. warning:: + This function is deprecated in favor of + :func:`torch.nn.quantized.functional.interpolate`. + This is equivalent with + ``nn.quantized.functional.interpolate(..., mode='bilinear', align_corners=True)``. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D inputs are supported + + Args: + input (Tensor): quantized input + size (int or Tuple[int, int]): output spatial size. + scale_factor (int or Tuple[int, int]): multiplier for spatial size + """ + # DeprecationWarning is ignored by default + warnings.warn("nn.quantized.functional.upsample_bilinear is deprecated. Use nn.quantized.functional.interpolate instead.") + return interpolate(input, size, scale_factor, mode='bilinear', align_corners=True) + +def upsample_nearest(input, size=None, scale_factor=None): + r"""Upsamples the input, using nearest neighbours' pixel values. + + .. warning:: + This function is deprecated in favor of + :func:`torch.nn.quantized.functional.interpolate`. + This is equivalent with ``nn.quantized.functional.interpolate(..., mode='nearest')``. + + .. note:: The input quantization parameters propagate to the output. + + .. note:: Only 2D inputs are supported + + Args: + input (Tensor): quantized input + size (int or Tuple[int, int] or Tuple[int, int, int]): output spatial + size. + scale_factor (int): multiplier for spatial size. Has to be an integer. 
+    """
+    # DeprecationWarning is ignored by default
+    warnings.warn("nn.quantized.functional.upsample_nearest is deprecated. Use nn.quantized.functional.interpolate instead.")
+    return interpolate(input, size, scale_factor, mode='nearest')

From f4d0d0a8112b450a548be6f494f36e14c5528c78 Mon Sep 17 00:00:00 2001
From: Junjie Bai
Date: Fri, 4 Oct 2019 17:39:53 -0700
Subject: [PATCH 080/113] Enable RCCL in ROCm build (#27383)

Summary:
Continues https://github.com/pytorch/pytorch/pull/23884
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27383

Differential Revision: D17767248

Pulled By: bddppq

fbshipit-source-id: 3a506844ca6f01d7bbe8be5bde0976999e3a2b90
---
 CMakeLists.txt                                 |  2 +-
 cmake/Dependencies.cmake                       | 18 +++++++++++++-----
 cmake/External/rccl.cmake                      | 18 ++++++++++++++++++
 cmake/public/LoadHIP.cmake                     | 12 ++++++++++++
 .../amd_build/pyHIPIFY/cuda_to_hip_mappings.py |  5 +++--
 tools/amd_build/pyHIPIFY/hipify_python.py      |  2 +-
 torch/CMakeLists.txt                           |  2 --
 7 files changed, 48 insertions(+), 11 deletions(-)
 create mode 100644 cmake/External/rccl.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 909d6b914a26b..4fac53ec586ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,7 +140,7 @@ option(USE_METAL "Use Metal for iOS build" ON)
 option(USE_NATIVE_ARCH "Use -march=native" OFF)
 cmake_dependent_option(
     USE_NCCL "Use NCCL" ON
-    "USE_CUDA;UNIX;NOT APPLE" OFF)
+    "USE_CUDA OR USE_ROCM;UNIX;NOT APPLE" OFF)
 cmake_dependent_option(
     USE_STATIC_NCCL "Use static NCCL" OFF
     "USE_NCCL" OFF)
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index a8e9769536cb8..28b6bbfd6dcd4 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -948,6 +948,11 @@ if(USE_ROCM)
     message(INFO "Compiling with HIP for AMD.")
     caffe2_update_option(USE_ROCM ON)
 
+    if (USE_NCCL AND NOT USE_SYSTEM_NCCL)
+      message(INFO "Forcing USE_SYSTEM_NCCL to ON since it's required when using RCCL")
+      caffe2_update_option(USE_SYSTEM_NCCL ON)
+    endif()
+
     list(APPEND HIP_CXX_FLAGS -fPIC)
     list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1)
     list(APPEND HIP_CXX_FLAGS -DCUDA_HAS_FP16=1)
@@ -983,7 +988,7 @@ if(USE_ROCM)
     hip_include_directories(${Caffe2_HIP_INCLUDE})
 
     set(Caffe2_HIP_DEPENDENCY_LIBS
-      ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB})
+      ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${PYTORCH_RCCL_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB})
 
     # Note [rocblas & rocfft cmake bug]
     # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -1006,17 +1011,20 @@ endif()
 
 # ---[ NCCL
 if(USE_NCCL)
-  if(NOT USE_CUDA)
+  if(NOT (USE_CUDA OR USE_ROCM))
     message(WARNING
-        "Not using CUDA, so disabling NCCL. Suppress this warning with "
+        "Not using CUDA/ROCM, so disabling USE_NCCL. Suppress this warning with "
         "-DUSE_NCCL=OFF.")
     caffe2_update_option(USE_NCCL OFF)
   elseif(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
     message(WARNING "NCCL is currently only supported under Linux.")
     caffe2_update_option(USE_NCCL OFF)
-  else()
+  elseif(USE_CUDA)
     include(${CMAKE_CURRENT_LIST_DIR}/External/nccl.cmake)
     list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS __caffe2_nccl)
+  elseif(USE_ROCM)
+    include(${CMAKE_CURRENT_LIST_DIR}/External/rccl.cmake)
+    list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS __caffe2_nccl)
   endif()
 endif()
 
@@ -1058,7 +1066,7 @@ if(USE_GLOO)
       # Add explicit dependency since NCCL is built from third_party.
      # Without dependency, make -jN with N>1 can fail if the NCCL build
      # hasn't finished when CUDA targets are linked.
-      if(USE_NCCL)
+      if(USE_NCCL AND NOT USE_ROCM)
         add_dependencies(gloo_cuda nccl_external)
       endif()
       # Pick the right dependency depending on USE_CUDA
diff --git a/cmake/External/rccl.cmake b/cmake/External/rccl.cmake
new file mode 100644
index 0000000000000..bc9265d4a3cc2
--- /dev/null
+++ b/cmake/External/rccl.cmake
@@ -0,0 +1,18 @@
+if (NOT __NCCL_INCLUDED)
+  set(__NCCL_INCLUDED TRUE)
+
+  if (USE_SYSTEM_NCCL)
+    # NCCL_ROOT, NCCL_LIB_DIR, NCCL_INCLUDE_DIR will be accounted for in the following line.
+    find_package(RCCL REQUIRED)
+    if (RCCL_FOUND)
+      message (STATUS "RCCL Found!")
+      add_library(__caffe2_nccl INTERFACE)
+      target_link_libraries(__caffe2_nccl INTERFACE ${PYTORCH_RCCL_LIBRARIES})
+      target_include_directories(__caffe2_nccl INTERFACE ${RCCL_INCLUDE_DIRS})
+    else()
+      message (STATUS "RCCL NOT Found!")
+    endif()
+  else()
+    message (STATUS "USE_SYSTEM_NCCL=OFF is not supported yet when using RCCL")
+  endif()
+endif()
diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake
index e552ade492b68..cdf27cdf66d0d 100644
--- a/cmake/public/LoadHIP.cmake
+++ b/cmake/public/LoadHIP.cmake
@@ -80,6 +80,13 @@ ELSE()
   SET(MIOPEN_PATH $ENV{MIOPEN_PATH})
 ENDIF()
 
+# RCCL_PATH
+IF(NOT DEFINED ENV{RCCL_PATH})
+  SET(RCCL_PATH ${ROCM_PATH}/rccl)
+ELSE()
+  SET(RCCL_PATH $ENV{RCCL_PATH})
+ENDIF()
+
 # ROCPRIM_PATH
 IF(NOT DEFINED ENV{ROCPRIM_PATH})
   SET(ROCPRIM_PATH ${ROCM_PATH}/rocprim)
@@ -145,6 +152,7 @@ IF(HIP_FOUND)
   set(miopen_DIR ${MIOPEN_PATH}/lib/cmake/miopen)
   set(rocfft_DIR ${ROCFFT_PATH}/lib/cmake/rocfft)
   set(hipsparse_DIR ${HIPSPARSE_PATH}/lib/cmake/hipsparse)
+  set(rccl_DIR ${RCCL_PATH}/lib/cmake/rccl)
   set(rocprim_DIR ${ROCPRIM_PATH}/lib/cmake/rocprim)
   set(hipcub_DIR ${HIPCUB_PATH}/lib/cmake/hipcub)
   set(rocthrust_DIR ${ROCTHRUST_PATH}/lib/cmake/rocthrust)
@@ -155,6 +163,7 @@ IF(HIP_FOUND)
   find_package_and_print_version(miopen REQUIRED)
   find_package_and_print_version(rocfft REQUIRED)
   find_package_and_print_version(hipsparse REQUIRED)
+  find_package_and_print_version(rccl)
   find_package_and_print_version(rocprim REQUIRED)
   find_package_and_print_version(hipcub REQUIRED)
   find_package_and_print_version(rocthrust REQUIRED)
@@ -167,6 +176,9 @@ IF(HIP_FOUND)
   # TODO: miopen_LIBRARIES should return fullpath to the library file,
   # however currently it's just the lib name
   FIND_LIBRARY(PYTORCH_MIOPEN_LIBRARIES ${miopen_LIBRARIES} HINTS ${MIOPEN_PATH}/lib)
+  # TODO: rccl_LIBRARIES should return fullpath to the library file,
+  # however currently it's just the lib name
+  FIND_LIBRARY(PYTORCH_RCCL_LIBRARIES ${rccl_LIBRARIES} HINTS ${RCCL_PATH}/lib)
   # hiprtc is part of HIP
   FIND_LIBRARY(ROCM_HIPRTC_LIB hiprtc HINTS ${HIP_PATH}/lib)
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index e220e9e306138..12648b9a2e571 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -278,8 +278,9 @@
     ("cusparse.h", ("hipsparse.h", CONV_INCLUDE, API_RAND)),
     ("cufft.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
     ("cufftXt.h", ("hipfft.h", CONV_INCLUDE, API_BLAS)),
+    ("<nccl.h>", ("<rccl.h>", CONV_INCLUDE, API_RUNTIME)),  # PyTorch also has a source file named "nccl.h", so we need to use "<"">" to differentiate
     ("nvrtc.h", ("hip/hiprtc.h", CONV_INCLUDE, API_RTC)),
-    ("thrust/system/cuda/", ("thrust/system/hip/", CONV_INCLUDE, API_BLAS)),
+    ("thrust/system/cuda", ("thrust/system/hip", CONV_INCLUDE, API_BLAS)),
     ("cub/util_allocator.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
     ("cub/block/block_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), ("cub/cub.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), @@ -2205,7 +2206,7 @@ ("nvrtcGetProgramLogSize", ("hiprtcGetProgramLogSize", CONV_JIT, API_RTC)), ("nvrtcGetPTX", ("hiprtcGetCode", CONV_JIT, API_RTC)), ("nvrtcGetPTXSize", ("hiprtcGetCodeSize", CONV_JIT, API_RTC)), - ("thrust::cuda::", ("thrust::hip::", CONV_MATH_FUNC, API_BLAS)), + ("thrust::cuda", ("thrust::hip", CONV_MATH_FUNC, API_BLAS)), ("cub::", ("hipcub::", CONV_MATH_FUNC, API_BLAS)), ]) diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/tools/amd_build/pyHIPIFY/hipify_python.py index dace0c70f0c95..2a0de6b36667c 100755 --- a/tools/amd_build/pyHIPIFY/hipify_python.py +++ b/tools/amd_build/pyHIPIFY/hipify_python.py @@ -594,7 +594,7 @@ def pattern(self): CAFFE2_TRIE.add(src) CAFFE2_MAP[src] = dst RE_CAFFE2_PREPROCESSOR = re.compile(CAFFE2_TRIE.pattern()) -RE_PYTORCH_PREPROCESSOR = re.compile(r'\b{0}\b'.format(PYTORCH_TRIE.pattern())) +RE_PYTORCH_PREPROCESSOR = re.compile(r'(?<=\W)({0})(?=\W)'.format(PYTORCH_TRIE.pattern())) RE_QUOTE_HEADER = re.compile(r'#include "([^"]+)"') RE_ANGLE_HEADER = re.compile(r'#include <([^>]+)>') diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index 43481ee687064..ad0b5a2930066 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -251,8 +251,6 @@ if (USE_NCCL) ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) - if (USE_SYSTEM_NCCL) - endif() endif() # In the most recent CMake versions, a new 'TRANSFORM' subcommand of 'list' allows much of the boilerplate of defining the lists From 1f0328c6d41ef7bd69c915c329f5788816de7966 Mon Sep 17 00:00:00 2001 From: Hao Lu Date: Fri, 4 Oct 2019 18:25:08 -0700 Subject: [PATCH 081/113] Add randomFill to test_utils.h Summary: Add helper function randomFill to test_utils.h so we can use it in benchmark scrips as well tests. Test Plan: ``` buck run mode/opt //tvm/sparse:cblas_bench ``` Reviewed By: yinghai Differential Revision: D17759193 fbshipit-source-id: e4909b04e83ca9382ab4718855fb63743d028de1 --- caffe2/core/test_utils.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/caffe2/core/test_utils.h b/caffe2/core/test_utils.h index d60cdaeecd5f7..68214b235b4c6 100644 --- a/caffe2/core/test_utils.h +++ b/caffe2/core/test_utils.h @@ -18,7 +18,9 @@ namespace caffe2 { namespace testing { // Asserts that the values of two tensors are the same. -CAFFE2_API void assertTensorEquals(const TensorCPU& tensor1, const TensorCPU& tensor2); +CAFFE2_API void assertTensorEquals( + const TensorCPU& tensor1, + const TensorCPU& tensor2); // Asserts that two float values are close within epsilon. CAFFE2_API void assertNear(float value1, float value2, float epsilon); @@ -74,6 +76,22 @@ CAFFE2_API caffe2::OperatorDef* createOperator( const std::vector& outputs, caffe2::NetDef* net); +// Fill a buffer with randomly generated numbers given range [min, max) +// T can only be float, double or long double +template +void randomFill( + RealType* data, + size_t size, + const double min = 0.0, + const double max = 1.0) { + std::mt19937 gen(42); + std::uniform_real_distribution dis( + static_cast(min), static_cast(max)); + for (size_t i = 0; i < size; i++) { + data[i] = dis(gen); + } +} + // Fill data from a vector to a tensor. 
 template <typename T>
 void fillTensor(

From 1741adfd3e1701ded9f3a125cd198eafd9ea3a89 Mon Sep 17 00:00:00 2001
From: BowenBao
Date: Fri, 4 Oct 2019 19:00:08 -0700
Subject: [PATCH 082/113] Use deepcopy inputs for ONNX ort test cases (#27186)

Summary:
Running models with in-place operators will change the values of input tensors. Deepcopy input tensors each time to keep the original input tensors intact.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27186

Differential Revision: D17776598

Pulled By: jerryzh168

fbshipit-source-id: d4808a11185a9ab0d782a62d7d708dfe7e94559c
---
 test/onnx/test_pytorch_onnx_onnxruntime.py | 28 +++++++++++++++++-----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py
index 4648d1b06b543..307d942b5b971 100644
--- a/test/onnx/test_pytorch_onnx_onnxruntime.py
+++ b/test/onnx/test_pytorch_onnx_onnxruntime.py
@@ -10,6 +10,7 @@
 import numpy as np
 import io
 import itertools
+import copy
 
 from torch.nn.utils import rnn as rnn_utils
 from model_defs.lstm_flattening_result import LstmFlatteningResult
@@ -57,13 +58,17 @@ def run_model_test(self, model, batch_size=2, state_dict=None,
     with torch.no_grad():
         if isinstance(input, torch.Tensor):
             input = (input,)
-        output = model(*input)
+        # In-place operators will update input tensor data as well.
+        # Thus inputs are replicated before every forward call.
+        input_copy = copy.deepcopy(input)
+        output = model(*input_copy)
         if isinstance(output, torch.Tensor):
             output = (output,)
 
         # export the model to ONNX
         f = io.BytesIO()
-        torch.onnx._export(model, input, f,
+        input_copy = copy.deepcopy(input)
+        torch.onnx._export(model, input_copy, f,
                            opset_version=self.opset_version,
                            example_outputs=output,
                            do_constant_folding=do_constant_folding,
@@ -74,7 +79,8 @@ def run_model_test(self, model, batch_size=2, state_dict=None,
 
         # compute onnxruntime output prediction
         ort_sess = onnxruntime.InferenceSession(f.getvalue())
-        ort_test_with_input(ort_sess, input, output, rtol, atol)
+        input_copy = copy.deepcopy(input)
+        ort_test_with_input(ort_sess, input_copy, output, rtol, atol)
 
         # if additional test inputs are provided run the onnx
         # model with these inputs and check the outputs
@@ -82,7 +88,8 @@ def run_model_test(self, model, batch_size=2, state_dict=None,
         for test_input in test_with_inputs:
             if isinstance(test_input, torch.Tensor):
                 test_input = (test_input,)
-            output = model(*test_input)
+            test_input_copy = copy.deepcopy(test_input)
+            output = model(*test_input_copy)
             if isinstance(output, torch.Tensor):
                 output = (output,)
             ort_test_with_input(ort_sess, test_input, output, rtol, atol)
@@ -1152,19 +1159,28 @@ def forward(self, x):
         x = torch.randn(2, 4, 5, 6, requires_grad=True)
         self.run_test(GeluModel(), x)
 
+    def test_add_inplace(self):
+        class InplaceAddModel(torch.nn.Module):
+            def forward(self, x):
+                x += 12
+                return x
+
+        x = torch.randn(4, 2, 3, requires_grad=True)
+        self.run_test(InplaceAddModel(), x)
+
     def test_rsqrt(self):
         class RsqrtModel(torch.nn.Module):
             def forward(self, x):
                 return x.rsqrt()
-        x = torch.randn(4, 2, 3, requires_grad=True).to(dtype=torch.float64)
+        x = torch.randn(4, 2, 3, requires_grad=True, dtype=torch.float64)
         self.run_test(RsqrtModel(), x)
 
     def test_rsqrt_zeros(self):
         class RsqrtModel(torch.nn.Module):
             def forward(self, x):
                 return x.rsqrt()
-        x = torch.zeros(4, 2, 3, requires_grad=True).to(dtype=torch.float64)
+        x = torch.zeros(4, 2, 3, requires_grad=True, dtype=torch.float64)
         self.run_test(RsqrtModel(), x)
 
     # TODO: enable opset 11
test once ORT support for unique is in From 7a2e61c28ee27f8320969870d5c6460f59bb8bfd Mon Sep 17 00:00:00 2001 From: Shen Li Date: Fri, 4 Oct 2019 21:22:46 -0700 Subject: [PATCH 083/113] Remove dependency on six from dist_autograd_test.py Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27369 Test Plan: Imported from OSS Differential Revision: D17763104 Pulled By: mrshenli fbshipit-source-id: dd146809686e7720f2b77012eebb6aed72851556 --- test/dist_autograd_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/dist_autograd_test.py b/test/dist_autograd_test.py index c6eb071e07b83..d77100887bbdf 100644 --- a/test/dist_autograd_test.py +++ b/test/dist_autograd_test.py @@ -3,7 +3,6 @@ import time import unittest -import six import torch import torch.distributed as dist import torch.distributed.autograd as dist_autograd @@ -22,7 +21,7 @@ def _set_rpc_done(context_id): @unittest.skipIf( - not six.PY3, "Pytorch distributed autograd package " "does not support python2" + not torch._six.PY3, "Pytorch distributed autograd package " "does not support python2" ) class DistAutogradTest(object): @property From e0ae3ce5e4b5a98c8dd67b9ec1ea0f81dfc52fef Mon Sep 17 00:00:00 2001 From: Ayman Saleh Date: Fri, 4 Oct 2019 21:24:44 -0700 Subject: [PATCH 084/113] Docstring fix (#27225) Summary: Correcting docstring for `add_image_with_boxes` method. Fixed spelling mistake. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27225 Differential Revision: D17776604 Pulled By: jerryzh168 fbshipit-source-id: 45f69643ec3b58c46b9fb67411c42a6d09b7290e --- torch/utils/tensorboard/writer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py index 8d3065881f1ea..01e27b5813915 100644 --- a/torch/utils/tensorboard/writer.py +++ b/torch/utils/tensorboard/writer.py @@ -579,7 +579,7 @@ def add_image_with_boxes(self, tag, img_tensor, box_tensor, global_step=None, dataformats (string): Image data format specification of the form NCHW, NHWC, CHW, HWC, HW, WH, etc. Shape: - img_tensor: Default is :math:`(3, H, W)`. It can be specified with ``dataformat`` agrument. + img_tensor: Default is :math:`(3, H, W)`. It can be specified with ``dataformats`` argument. e.g. CHW or HWC box_tensor: (torch.Tensor, numpy.array, or string/blobname): NX4, where N is the number of From bac11d1002ab7306579216ea0c568492c7bc1b54 Mon Sep 17 00:00:00 2001 From: Dmytro Dzhulgakov Date: Fri, 4 Oct 2019 22:12:25 -0700 Subject: [PATCH 085/113] Tweak docs on building docs Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27364 Differential Revision: D17777402 Pulled By: dzhulgakov fbshipit-source-id: 304c678e5c80d7f8c779d65c11f9bf1b0facdb52 --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1274d24c82af9..548f16f9c0210 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -218,6 +218,8 @@ pip install -r requirements.txt # `katex` must also be available in your PATH. # If you are using Ubuntu or Debian, you can install it with: # sudo apt install katex +# If you are using MacOS, you can install it through npm (install Node.js first): +# npm install -g katex ``` 3. Generate the documentation HTML files. The generated files will be in `docs/build/html`. 
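
A short aside on the deepcopy pattern from patch 082 above: any harness that
reuses input tensors across calls is exposed to the same bug, because a module
with an in-place op mutates the tensors it is given. The sketch below is
illustrative only (InplaceAdd and the shapes are made up for this example, not
taken from any patch here) and uses just the public torch and copy APIs:

```
import copy
import torch

class InplaceAdd(torch.nn.Module):
    def forward(self, x):
        x += 12  # in-place add: mutates the caller's tensor
        return x

model = InplaceAdd()
inp = (torch.zeros(2, 3),)

# Without a copy, the "reference" input is silently corrupted by one call.
model(*inp)
assert bool(inp[0].eq(12).all())

# Deep-copying before each call keeps the original intact, so the later ONNX
# export and the onnxruntime comparison both see the same data.
inp = (torch.zeros(2, 3),)
out = model(*copy.deepcopy(inp))
assert bool(inp[0].eq(0).all())
assert bool(out.eq(12).all())
```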
From 04436f6c609776b6aeeaf043fa3445a273ccc9cd Mon Sep 17 00:00:00 2001 From: Junjie Bai Date: Sat, 5 Oct 2019 00:35:10 -0700 Subject: [PATCH 086/113] Upgrade to ROCm 2.9 (#27417) Summary: New docker images built with tag 325: https://ci.pytorch.org/jenkins/job/caffe2-docker-trigger/325 Related ossci-job-dsl commits: https://github.com/pytorch/ossci-job-dsl/commit/a00a76f927944aed961a3bbbc4f17aff0fc30d71 Pull Request resolved: https://github.com/pytorch/pytorch/pull/27417 Differential Revision: D17777517 Pulled By: bddppq fbshipit-source-id: a6b8cb86b37f537d402f6d2c7d28ad28a6a5a317 --- .../cimodel/data/caffe2_build_definitions.py | 2 +- .circleci/config.yml | 40 +++++++++---------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/.circleci/cimodel/data/caffe2_build_definitions.py b/.circleci/cimodel/data/caffe2_build_definitions.py index 07c3a24c48921..c58492a8e1a79 100644 --- a/.circleci/cimodel/data/caffe2_build_definitions.py +++ b/.circleci/cimodel/data/caffe2_build_definitions.py @@ -14,7 +14,7 @@ DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/" -DOCKER_IMAGE_VERSION = 324 +DOCKER_IMAGE_VERSION = 325 @dataclass diff --git a/.circleci/config.yml b/.circleci/config.yml index fb1808599691e..5f5bf784d70f6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1930,7 +1930,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-gcc4.8-ubuntu14.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:325" - caffe2_linux_test: name: caffe2_py2_gcc4_8_ubuntu14_04_test requires: @@ -1942,7 +1942,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-gcc4.8-ubuntu14.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:325" resource_class: large - caffe2_linux_build: name: caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build @@ -1954,7 +1954,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:325" - caffe2_linux_test: name: caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test requires: @@ -1967,14 +1967,14 @@ workflows: - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-test" use_cuda_docker_runtime: "1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:325" resource_class: gpu.medium - caffe2_linux_build: name: caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build requires: - setup build_environment: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:325" - caffe2_linux_test: name: caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test requires: @@ -1982,14 +1982,14 @@ workflows: - caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build build_environment: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-test" use_cuda_docker_runtime: "1" - docker_image: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:325" resource_class: gpu.medium - caffe2_linux_build: name: caffe2_py3_5_cuda10_1_cudnn7_ubuntu16_04_build requires: - setup build_environment: "caffe2-py3.5-cuda10.1-cudnn7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:325" - caffe2_linux_test: name: caffe2_py3_5_cuda10_1_cudnn7_ubuntu16_04_test requires: @@ -1997,35 +1997,35 @@ workflows: - caffe2_py3_5_cuda10_1_cudnn7_ubuntu16_04_build build_environment: "caffe2-py3.5-cuda10.1-cudnn7-ubuntu16.04-test" use_cuda_docker_runtime: "1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.5-cuda10.1-cudnn7-ubuntu16.04:325" resource_class: gpu.medium - caffe2_linux_build: name: caffe2_py2_mkl_ubuntu16_04_build requires: - setup build_environment: "caffe2-py2-mkl-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:325" - caffe2_linux_test: name: caffe2_py2_mkl_ubuntu16_04_test requires: - setup - caffe2_py2_mkl_ubuntu16_04_build build_environment: "caffe2-py2-mkl-ubuntu16.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:325" resource_class: large - caffe2_linux_build: name: caffe2_onnx_py2_gcc5_ubuntu16_04_build requires: - setup build_environment: "caffe2-onnx-py2-gcc5-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:325" - caffe2_linux_test: name: caffe2_onnx_py2_gcc5_ubuntu16_04_test requires: - setup - caffe2_onnx_py2_gcc5_ubuntu16_04_build build_environment: "caffe2-onnx-py2-gcc5-ubuntu16.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:325" resource_class: large - caffe2_linux_build: name: caffe2_py2_clang3_8_ubuntu16_04_build @@ -2037,7 +2037,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-clang3.8-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:325" build_only: "1" - caffe2_linux_build: name: caffe2_py2_clang3_9_ubuntu16_04_build @@ -2049,35 +2049,35 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-clang3.9-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:325" build_only: "1" - caffe2_linux_build: name: caffe2_py2_clang7_ubuntu16_04_build requires: - setup build_environment: "caffe2-py2-clang7-ubuntu16.04-build" - docker_image: 
"308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:325" build_only: "1" - caffe2_linux_build: name: caffe2_onnx_py3_6_clang7_ubuntu16_04_build requires: - setup build_environment: "caffe2-onnx-py3.6-clang7-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:325" - caffe2_linux_test: name: caffe2_onnx_py3_6_clang7_ubuntu16_04_test requires: - setup - caffe2_onnx_py3_6_clang7_ubuntu16_04_build build_environment: "caffe2-onnx-py3.6-clang7-ubuntu16.04-test" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:325" resource_class: large - caffe2_linux_build: name: caffe2_py2_android_ubuntu16_04_build requires: - setup build_environment: "caffe2-py2-android-ubuntu16.04-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:325" build_only: "1" - caffe2_linux_build: name: caffe2_py2_cuda9_0_cudnn7_centos7_build @@ -2089,7 +2089,7 @@ workflows: - master - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-centos7-build" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:325" - caffe2_linux_test: name: caffe2_py2_cuda9_0_cudnn7_centos7_test requires: @@ -2102,7 +2102,7 @@ workflows: - /ci-all\/.*/ build_environment: "caffe2-py2-cuda9.0-cudnn7-centos7-test" use_cuda_docker_runtime: "1" - docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:324" + docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:325" resource_class: gpu.medium - caffe2_macos_build: name: caffe2_py2_ios_macos10_13_build From 17c672e704ec5b2cee4698a3dd17e0e409b0d57b Mon Sep 17 00:00:00 2001 From: Johannes M Dieterich Date: Sat, 5 Oct 2019 01:53:33 -0700 Subject: [PATCH 087/113] enable rocTX API (#27416) Summary: ROCm 2.9 brings support for the rocTX API through rocTracer. 
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27416 Differential Revision: D17777480 Pulled By: bddppq fbshipit-source-id: 6bce9b54c94e5b4c5787570d2b85736882bd23a7 --- cmake/Dependencies.cmake | 4 ++-- cmake/public/LoadHIP.cmake | 11 ++++++++++- tools/amd_build/pyHIPIFY/constants.py | 3 ++- tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py | 5 +++++ torch/csrc/autograd/profiler_cuda.cpp | 8 -------- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 28b6bbfd6dcd4..e6855f345b190 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -983,12 +983,12 @@ if(USE_ROCM) endforeach() set(Caffe2_HIP_INCLUDE - ${thrust_INCLUDE_DIRS} ${hipcub_INCLUDE_DIRS} ${rocprim_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDE}) + ${thrust_INCLUDE_DIRS} ${hipcub_INCLUDE_DIRS} ${rocprim_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${roctracer_INCLUDE_DIRS} ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDE}) # This is needed for library added by hip_add_library (same for hip_add_executable) hip_include_directories(${Caffe2_HIP_INCLUDE}) set(Caffe2_HIP_DEPENDENCY_LIBS - ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${PYTORCH_RCCL_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB}) + ${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${PYTORCH_RCCL_LIBRARIES} ${hipcub_LIBRARIES} ${ROCM_HIPRTC_LIB} ${ROCM_ROCTX_LIB}) # Note [rocblas & rocfft cmake bug] # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/cmake/public/LoadHIP.cmake b/cmake/public/LoadHIP.cmake index cdf27cdf66d0d..fd5e97d3e1e26 100644 --- a/cmake/public/LoadHIP.cmake +++ b/cmake/public/LoadHIP.cmake @@ -108,6 +108,13 @@ ELSE() SET(ROCTHRUST_PATH $ENV{ROCTHRUST_PATH}) ENDIF() +# ROCTRACER_PATH +IF(NOT DEFINED ENV{ROCTRACER_PATH}) + SET(ROCTRACER_PATH ${ROCM_PATH}/roctracer) +ELSE() + SET(ROCTRACER_PATH $ENV{ROCTRACER_PATH}) +ENDIF() + IF(NOT DEFINED ENV{PYTORCH_ROCM_ARCH}) SET(PYTORCH_ROCM_ARCH gfx803;gfx900;gfx906;gfx908) ELSE() @@ -181,7 +188,9 @@ IF(HIP_FOUND) FIND_LIBRARY(PYTORCH_RCCL_LIBRARIES ${rccl_LIBRARIES} HINTS ${RCCL_PATH}/lib) # hiprtc is part of HIP FIND_LIBRARY(ROCM_HIPRTC_LIB hiprtc HINTS ${HIP_PATH}/lib) - + # roctx is part of roctracer + FIND_LIBRARY(ROCM_ROCTX_LIB roctx64 HINTS ${ROCTRACER_PATH}/lib) + set(roctracer_INCLUDE_DIRS ${ROCTRACER_PATH}/include) # Necessary includes for building PyTorch since we include HIP headers that depend on hcc/hsa headers. 
  set(hcc_INCLUDE_DIRS ${HCC_PATH}/include)
diff --git a/tools/amd_build/pyHIPIFY/constants.py b/tools/amd_build/pyHIPIFY/constants.py
index 1384e6c891008..b0bd4f9f0313b 100644
--- a/tools/amd_build/pyHIPIFY/constants.py
+++ b/tools/amd_build/pyHIPIFY/constants.py
@@ -52,8 +52,9 @@
 API_LAST = 42
 API_FFT = 43
 API_RTC = 44
+API_ROCTX = 45
 
-HIP_UNSUPPORTED = 43
+HIP_UNSUPPORTED = 46
 API_PYTORCH = 1337
 API_CAFFE2 = 1338
 API_C10 = 1339
diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
index 12648b9a2e571..1e04f687e457e 100644
--- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
+++ b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py
@@ -287,6 +287,7 @@
    ("cub/block/block_load.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
    ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
    ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+    ("nvToolsExt.h", ("roctx.h", CONV_INCLUDE, API_ROCTX)),
 ])
 
 CUDA_IDENTIFIER_MAP = collections.OrderedDict([
@@ -2208,6 +2209,10 @@
    ("nvrtcGetPTXSize", ("hiprtcGetCodeSize", CONV_JIT, API_RTC)),
    ("thrust::cuda", ("thrust::hip", CONV_MATH_FUNC, API_BLAS)),
    ("cub::", ("hipcub::", CONV_MATH_FUNC, API_BLAS)),
+    ("nvtxMark", ("roctxMark", CONV_OTHER, API_ROCTX)),
+    ("nvtxMarkA", ("roctxMarkA", CONV_OTHER, API_ROCTX)),
+    ("nvtxRangePushA", ("roctxRangePushA", CONV_OTHER, API_ROCTX)),
+    ("nvtxRangePop", ("roctxRangePop", CONV_OTHER, API_ROCTX)),
 ])
 
 CUDA_SPARSE_MAP = collections.OrderedDict([
diff --git a/torch/csrc/autograd/profiler_cuda.cpp b/torch/csrc/autograd/profiler_cuda.cpp
index ae3b1bbc86a56..b03d574431877 100644
--- a/torch/csrc/autograd/profiler_cuda.cpp
+++ b/torch/csrc/autograd/profiler_cuda.cpp
@@ -1,8 +1,6 @@
 #include
 #include
-#ifndef __HIP_PLATFORM_HCC__
 #include <nvToolsExt.h>
-#endif
 
 #include
 
@@ -35,19 +33,13 @@ struct CUDAMethods : public CUDAStubs {
    return ms*1000.0;
  }
  void nvtxMarkA(const char* name) override {
-#ifndef __HIP_PLATFORM_HCC__
    ::nvtxMark(name);
-#endif
  }
  void nvtxRangePushA(const char* name) override {
-#ifndef __HIP_PLATFORM_HCC__
    ::nvtxRangePushA(name);
-#endif
  }
  void nvtxRangePop() override {
-#ifndef __HIP_PLATFORM_HCC__
    ::nvtxRangePop();
-#endif
  }
  void onEachDevice(std::function<void(int)> op) override {
    at::cuda::OptionalCUDAGuard device_guard;

From 2cc1e69cc94e7d257a696d1d94170dbfd9be35ac Mon Sep 17 00:00:00 2001
From: Pavel Belevich
Date: Sat, 5 Oct 2019 06:16:44 -0700
Subject: [PATCH 088/113] C++ API parity: LogSigmoid

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27060

Test Plan: Imported from OSS

Differential Revision: D17682404

Pulled By: pbelevich

fbshipit-source-id: d60d64cd4caf1f56a2e05c516f91321d46ec9624
---
 test/cpp/api/functional.cpp                   | 13 ++++++++++++
 test/cpp/api/modules.cpp                      | 21 +++++++++++++++++++
 .../include/torch/nn/functional/activation.h  |  4 ++++
 .../api/include/torch/nn/modules/activation.h | 19 +++++++++++++++++
 torch/csrc/api/src/nn/modules/activation.cpp  | 12 +++++++++++
 5 files changed, 69 insertions(+)

diff --git a/test/cpp/api/functional.cpp b/test/cpp/api/functional.cpp
index cc694973445e9..0be5fbffeef4f 100644
--- a/test/cpp/api/functional.cpp
+++ b/test/cpp/api/functional.cpp
@@ -347,3 +347,16 @@ TEST_F(FunctionalTest, LeakyReLU) {
    }
  }
 }
+
+TEST_F(FunctionalTest, LogSigmoid) {
+  const auto size = 3;
+  LogSigmoid model;
+  auto x = torch::linspace(-10.0, 10.0, size * size * size);
+  x.resize_({size, size, size});
+  auto y = F::logsigmoid(x);
+
+  ASSERT_EQ(y.ndimension(), 3);
+  ASSERT_EQ(y.sizes(), torch::IntArrayRef({size, size, size}));
+  auto y_exp = torch::log(torch::ones_like(x)/(torch::ones_like(x) + torch::exp(torch::neg(x))));
+  ASSERT_TRUE(torch::allclose(y, y_exp, 1e-4, 1e-7));
+}
diff --git a/test/cpp/api/modules.cpp b/test/cpp/api/modules.cpp
index 064082fbd0360..715bb50f3ab63 100644
--- a/test/cpp/api/modules.cpp
+++ b/test/cpp/api/modules.cpp
@@ -1099,6 +1099,23 @@ TEST_F(ModulesTest, LeakyReLU) {
  }
 }
 
+TEST_F(ModulesTest, LogSigmoid) {
+  const auto size = 3;
+  LogSigmoid model;
+  auto x = torch::linspace(-10.0, 10.0, size * size * size);
+  x.resize_({size, size, size}).set_requires_grad(true);
+  auto y = model(x);
+  torch::Tensor s = y.sum();
+
+  s.backward();
+  ASSERT_EQ(s.ndimension(), 0);
+
+  ASSERT_EQ(y.ndimension(), 3);
+  ASSERT_EQ(y.sizes(), torch::IntArrayRef({size, size, size}));
+  auto y_exp = torch::log(torch::ones_like(x)/(torch::ones_like(x) + torch::exp(torch::neg(x))));
+  ASSERT_TRUE(torch::allclose(y, y_exp, 1e-4, 1e-7));
+}
+
 TEST_F(ModulesTest, PrettyPrintIdentity) {
  ASSERT_EQ(c10::str(Identity()), "torch::nn::Identity()");
 }
@@ -1349,3 +1366,7 @@ TEST_F(ModulesTest, PrettyPrintLeakyReLU) {
          LeakyReLUOptions().negative_slope(0.42).inplace(true))),
      "torch::nn::LeakyReLU(negative_slope=0.42, inplace=true)");
 }
+
+TEST_F(ModulesTest, PrettyPrintLogSigmoid) {
+  ASSERT_EQ(c10::str(LogSigmoid()), "torch::nn::LogSigmoid()");
+}
diff --git a/torch/csrc/api/include/torch/nn/functional/activation.h b/torch/csrc/api/include/torch/nn/functional/activation.h
index 684d57b731b89..4fbebed2bef49 100644
--- a/torch/csrc/api/include/torch/nn/functional/activation.h
+++ b/torch/csrc/api/include/torch/nn/functional/activation.h
@@ -36,6 +36,10 @@ inline Tensor leaky_relu(Tensor& input, const LeakyReLUOptions& options) {
  }
 }
 
+inline Tensor logsigmoid(const Tensor& input) {
+  return torch::log_sigmoid(input);
+}
+
 } // namespace functional
 } // namespace nn
 } // namespace torch
diff --git a/torch/csrc/api/include/torch/nn/modules/activation.h b/torch/csrc/api/include/torch/nn/modules/activation.h
index 36d84641a4a51..748980cbb46da 100644
--- a/torch/csrc/api/include/torch/nn/modules/activation.h
+++ b/torch/csrc/api/include/torch/nn/modules/activation.h
@@ -101,5 +101,24 @@ class TORCH_API LeakyReLUImpl : public torch::nn::Cloneable<LeakyReLUImpl> {
 
 TORCH_MODULE(LeakyReLU);
 
+// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ LogSigmoid ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+/// Applies the LogSigmoid function element-wise.
+/// See https://pytorch.org/docs/master/nn.html#torch.nn.LogSigmoid to learn
+/// about the exact behavior of this module.
+class TORCH_API LogSigmoidImpl : public torch::nn::Cloneable<LogSigmoidImpl> {
+ public:
+  LogSigmoidImpl() {}
+
+  Tensor forward(const Tensor& input);
+
+  void reset() override;
+
+  /// Pretty prints the `LogSigmoid` module into the given `stream`.
+ void pretty_print(std::ostream& stream) const override; +}; + +TORCH_MODULE(LogSigmoid); + } // namespace nn } // namespace torch diff --git a/torch/csrc/api/src/nn/modules/activation.cpp b/torch/csrc/api/src/nn/modules/activation.cpp index bf3893297ee2d..ddbb493896d95 100644 --- a/torch/csrc/api/src/nn/modules/activation.cpp +++ b/torch/csrc/api/src/nn/modules/activation.cpp @@ -84,5 +84,17 @@ void LeakyReLUImpl::pretty_print(std::ostream& stream) const { stream << ")"; } +// ============================================================================ + +Tensor LogSigmoidImpl::forward(const Tensor& input) { + return F::logsigmoid(input); +} + +void LogSigmoidImpl::reset() {} + +void LogSigmoidImpl::pretty_print(std::ostream& stream) const { + stream << "torch::nn::LogSigmoid()"; +} + } // namespace nn } // namespace torch From 0b6186d7783175dcbea8bbab7aed20b592a9a01d Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Sun, 6 Oct 2019 09:35:44 -0700 Subject: [PATCH 089/113] Remove Tensor.h, TensorMethods.h from src/core. (#27086) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27086 This is a major source of merge conflicts, and AFAICT isn't necessary anymore (it may have been necessary for some mobile build stuff in the past). This is a commandeer of #25031 Test Plan: Imported from OSS Reviewed By: ljk53 Differential Revision: D17687345 Pulled By: ezyang fbshipit-source-id: bf6131af835ed1f9e3c10699c81d4454a240445f --- aten/src/ATen/CMakeLists.txt | 9 +- aten/src/ATen/core/OpsAlreadyMovedToC10.cpp | 1493 ----- aten/src/ATen/core/TensorBody.h | 1023 --- aten/src/ATen/core/TensorMethods.h | 6236 ------------------- aten/src/ATen/core/context_base.cpp | 20 - aten/src/ATen/core/context_base.h | 164 - aten/src/ATen/function_wrapper.py | 4 +- aten/src/ATen/gen.py | 64 +- caffe2/CMakeLists.txt | 8 - caffe2/c2_aten_srcs.bzl | 2 - caffe2/core/context_base.cc | 17 +- caffe2/core/context_base.h | 165 +- caffe2/core/export_c10_op_to_caffe2.h | 7 +- caffe2/core/export_caffe2_op_to_c10.h | 6 +- caffe2/core/net_async_base.h | 2 +- caffe2/core/operator.cc | 8 +- caffe2/core/operator.h | 34 +- caffe2/core/tensor.cc | 2 + caffe2/core/tensor.h | 10 +- caffe2/core/tensor_impl.h | 4 +- caffe2/operators/CMakeLists.txt | 36 +- caffe2/operators/reduction_ops.h | 4 + cmake/Codegen.cmake | 17 +- docs/cpp/source/Doxyfile | 2 +- 24 files changed, 271 insertions(+), 9066 deletions(-) delete mode 100644 aten/src/ATen/core/OpsAlreadyMovedToC10.cpp delete mode 100644 aten/src/ATen/core/TensorBody.h delete mode 100644 aten/src/ATen/core/TensorMethods.h delete mode 100644 aten/src/ATen/core/context_base.cpp delete mode 100644 aten/src/ATen/core/context_base.h diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index dc150b645b737..b6fe232896069 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -82,7 +82,7 @@ FILE(GLOB native_quantized_hip_hip "native/quantized/hip/*.hip") FILE(GLOB native_quantized_hip_cpp "native/quantized/hip/*.cpp") add_subdirectory(quantized) -set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${cpu_kernel_cpp}) +set(all_cpu_cpp ${base_cpp} ${ATen_CORE_SRCS} ${native_cpp} ${native_sparse_cpp} ${native_quantized_cpp} ${native_mkl_cpp} ${native_mkldnn_cpp} ${generated_cpp} ${core_generated_cpp} ${ATen_CPU_SRCS} ${ATen_QUANTIZED_SRCS} ${cpu_kernel_cpp}) if(AT_MKL_ENABLED) 
set(all_cpu_cpp ${all_cpu_cpp} ${mkl_cpp}) endif() @@ -114,6 +114,7 @@ endif() filter_list(generated_h generated_cpp "\\.h$") filter_list(cuda_generated_h cuda_generated_cpp "\\.h$") +filter_list(core_generated_h core_generated_cpp "\\.h$") # TODO: When we have hip_generated_cpp #filter_list(hip_generated_h hip_generated_cpp "\\.h$") @@ -459,6 +460,12 @@ FOREACH(HEADER ${generated_h} ${cuda_generated_h}) INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen) ENDFOREACH() +message("AT_INSTALL_INCLUDE_DIR ${AT_INSTALL_INCLUDE_DIR}/ATen/core") +FOREACH(HEADER ${core_generated_h}) + message("core header install: ${HEADER}") + INSTALL(FILES ${HEADER} DESTINATION ${AT_INSTALL_INCLUDE_DIR}/ATen/core) +ENDFOREACH() + INSTALL(FILES ${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml DESTINATION ${AT_INSTALL_SHARE_DIR}/ATen) diff --git a/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp b/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp deleted file mode 100644 index e06b796202c1e..0000000000000 --- a/aten/src/ATen/core/OpsAlreadyMovedToC10.cpp +++ /dev/null @@ -1,1493 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include - -// @generated by aten/src/ATen/gen.py - -// TODO Once all ATen ops are moved to c10, this file should be removed - -namespace at { - -namespace { -struct OpNameEquals final { - bool operator()(const std::pair& lhs, const std::pair& rhs) const { - return 0 == strcmp(lhs.first, rhs.first) && 0 == strcmp(lhs.second, rhs.second); - } -}; - -struct OpNameHash final { - size_t operator()(const std::pair& p) const { - // use std::hash because std::hash would hash pointers and not pointed-to strings - return std::hash()(p.first) ^ (~ std::hash()(p.second)); - } -}; -} - -bool aten_op_is_already_moved_to_c10(const c10::OperatorName& opName) { - static std::unordered_set, OpNameHash, OpNameEquals> ops { - {"aten::_cast_Byte", ""}, - {"aten::_cast_Char", ""}, - {"aten::_cast_Double", ""}, - {"aten::_cast_Float", ""}, - {"aten::_cast_Int", ""}, - {"aten::_cast_Long", ""}, - {"aten::_cast_Short", ""}, - {"aten::_cast_Half", ""}, - {"aten::data", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::align_as", ""}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::align_tensors", ""}, - #endif - {"aten::_cudnn_ctc_loss", ""}, - {"aten::_cudnn_rnn_flatten_weight", ""}, - {"aten::_debug_has_internal_overlap", ""}, - {"aten::_fused_dropout", ""}, - {"aten::_masked_scale", ""}, - {"aten::_sobol_engine_ff_", ""}, - {"aten::_sobol_engine_scramble_", ""}, - {"aten::_sobol_engine_initialize_state_", ""}, - {"aten::_reshape_from_tensor", ""}, - {"aten::_shape_as_tensor", ""}, - {"aten::dropout", ""}, - {"aten::dropout_", ""}, - {"aten::feature_dropout", ""}, - {"aten::feature_dropout_", ""}, - {"aten::alpha_dropout", ""}, - {"aten::alpha_dropout_", ""}, - {"aten::feature_alpha_dropout", ""}, - {"aten::feature_alpha_dropout_", ""}, - {"aten::abs", ""}, - {"aten::abs_", ""}, - {"aten::acos", ""}, - {"aten::acos_", ""}, - {"aten::avg_pool1d", ""}, - {"aten::adaptive_avg_pool1d", ""}, - {"aten::adaptive_max_pool1d", ""}, - {"aten::add", "Tensor"}, - {"aten::add_", "Tensor"}, - {"aten::add", "Scalar"}, - {"aten::add_", "Scalar"}, - {"aten::addmv", ""}, - {"aten::addmv_", ""}, - {"aten::addr", ""}, - {"aten::addr_", ""}, - {"aten::affine_grid_generator", ""}, - {"aten::affine_grid_generator_backward", ""}, - {"aten::all", "dim"}, - {"aten::allclose", ""}, - {"aten::any", "dim"}, - {"aten::_dim_arange", ""}, - {"aten::argmax", ""}, - {"aten::argmin", ""}, - {"aten::as_strided", ""}, - 
{"aten::as_strided_", ""}, - {"aten::asin", ""}, - {"aten::asin_", ""}, - {"aten::atan", ""}, - {"aten::atan_", ""}, - {"aten::baddbmm", ""}, - {"aten::baddbmm_", ""}, - {"aten::_baddbmm_mkl_", ""}, - {"aten::bernoulli", ""}, - {"aten::bernoulli_", "Tensor"}, - {"aten::bernoulli_", "float"}, - {"aten::bernoulli", "p"}, - {"aten::bitwise_not", ""}, - {"aten::bitwise_not_", ""}, - {"aten::logical_not", ""}, - {"aten::logical_not_", ""}, - {"aten::logical_xor", ""}, - {"aten::logical_xor_", ""}, - {"aten::bmm", ""}, - {"aten::broadcast_tensors", ""}, - {"aten::cat", ""}, - {"aten::ceil", ""}, - {"aten::ceil_", ""}, - {"aten::chain_matmul", ""}, - {"aten::chunk", ""}, - {"aten::clamp", ""}, - {"aten::clamp_", ""}, - {"aten::clamp_max", ""}, - {"aten::clamp_max_", ""}, - {"aten::clamp_min", ""}, - {"aten::clamp_min_", ""}, - {"aten::cudnn_is_acceptable", ""}, - {"aten::constant_pad_nd", ""}, - {"aten::conv_tbc", ""}, - {"aten::conv_tbc_backward", ""}, - {"aten::copy_", ""}, - {"aten::_copy_from", ""}, - {"aten::cos", ""}, - {"aten::cos_", ""}, - {"aten::cosh", ""}, - {"aten::cosh_", ""}, - {"aten::cosine_embedding_loss", ""}, - {"aten::cudnn_affine_grid_generator", ""}, - {"aten::cudnn_affine_grid_generator_backward", ""}, - {"aten::cudnn_convolution_backward_input", ""}, - {"aten::cudnn_convolution_backward", ""}, - {"aten::cudnn_convolution_backward_bias", ""}, - {"aten::cudnn_convolution_backward_weight", ""}, - {"aten::cudnn_convolution_transpose_backward", ""}, - {"aten::cudnn_convolution_transpose_backward_bias", ""}, - {"aten::cudnn_convolution_transpose_backward_input", ""}, - {"aten::cudnn_convolution_transpose_backward_weight", ""}, - {"aten::cudnn_grid_sampler", ""}, - {"aten::cudnn_grid_sampler_backward", ""}, - {"aten::ctc_loss", "IntList"}, - {"aten::ctc_loss", "Tensor"}, - {"aten::_ctc_loss", ""}, - {"aten::_ctc_loss_backward", ""}, - {"aten::det", ""}, - {"aten::diag_embed", ""}, - {"aten::diagflat", ""}, - {"aten::diagonal", ""}, - {"aten::fill_diagonal_", ""}, - {"aten::div", "Tensor"}, - {"aten::div_", "Tensor"}, - {"aten::div", "Scalar"}, - {"aten::div_", "Scalar"}, - {"aten::dot", ""}, - {"aten::einsum", ""}, - {"aten::embedding", ""}, - {"aten::embedding_backward", ""}, - {"aten::embedding_dense_backward", ""}, - {"aten::embedding_renorm_", ""}, - {"aten::embedding_sparse_backward", ""}, - {"aten::_embedding_bag_per_sample_weights_backward", ""}, - {"aten::resize_", ""}, - {"aten::empty_like", ""}, - {"aten::erf", ""}, - {"aten::erf_", ""}, - {"aten::erfc", ""}, - {"aten::erfc_", ""}, - {"aten::exp", ""}, - {"aten::exp_", ""}, - {"aten::expm1", ""}, - {"aten::expm1_", ""}, - {"aten::expand", ""}, - {"aten::expand_as", ""}, - {"aten::flatten", "using_ints"}, - {"aten::fill_", "Scalar"}, - {"aten::fill_", "Tensor"}, - {"aten::floor", ""}, - {"aten::floor_", ""}, - {"aten::frac", ""}, - {"aten::frac_", ""}, - {"aten::full_like", ""}, - {"aten::grid_sampler", ""}, - {"aten::grid_sampler_2d", ""}, - {"aten::grid_sampler_2d_backward", ""}, - {"aten::grid_sampler_3d", ""}, - {"aten::grid_sampler_3d_backward", ""}, - {"aten::hinge_embedding_loss", ""}, - {"aten::ger", ""}, - {"aten::fft", ""}, - {"aten::ifft", ""}, - {"aten::rfft", ""}, - {"aten::irfft", ""}, - {"aten::_fft_with_size", ""}, - {"aten::_cufft_get_plan_cache_size", ""}, - {"aten::_cufft_get_plan_cache_max_size", ""}, - {"aten::index_copy_", ""}, - {"aten::index_copy", ""}, - {"aten::inverse", ""}, - {"aten::_inverse_helper", ""}, - {"aten::isclose", ""}, - {"aten::isnan", ""}, - {"aten::is_distributed", ""}, - 
{"aten::is_floating_point", ""}, - {"aten::is_complex", ""}, - {"aten::is_nonzero", ""}, - {"aten::is_same_size", ""}, - {"aten::is_signed", ""}, - {"aten::kl_div", ""}, - {"aten::kl_div_backward", ""}, - {"aten::kthvalue", ""}, - {"aten::fbgemm_linear_int8_weight_fp32_activation", ""}, - {"aten::fbgemm_linear_int8_weight", ""}, - {"aten::fbgemm_linear_quantize_weight", ""}, - {"aten::fbgemm_pack_gemm_matrix_fp16", ""}, - {"aten::fbgemm_linear_fp16_weight_fp32_activation", ""}, - {"aten::fbgemm_linear_fp16_weight", ""}, - {"aten::fbgemm_pack_quantized_matrix", ""}, - {"aten::fbgemm_pack_quantized_matrix", "KN"}, - {"aten::log", ""}, - {"aten::log_", ""}, - {"aten::log10", ""}, - {"aten::log10_", ""}, - {"aten::log1p", ""}, - {"aten::log1p_", ""}, - {"aten::log2", ""}, - {"aten::log2_", ""}, - {"aten::logdet", ""}, - {"aten::_log_softmax", ""}, - {"aten::_log_softmax_backward_data", ""}, - {"aten::logsumexp", ""}, - {"aten::margin_ranking_loss", ""}, - {"aten::matmul", ""}, - {"aten::matrix_rank", "tol"}, - {"aten::matrix_rank", ""}, - {"aten::matrix_power", ""}, - {"aten::max", "dim"}, - {"aten::max_values", ""}, - {"aten::max_pool1d_with_indices", ""}, - {"aten::max_pool1d", ""}, - {"aten::max_pool2d", ""}, - {"aten::mkldnn_max_pool2d", ""}, - {"aten::quantized_max_pool2d", ""}, - {"aten::max_pool3d", ""}, - {"aten::median", "dim"}, - {"aten::min", "dim"}, - {"aten::min_values", ""}, - {"aten::mkldnn_convolution_backward_input", ""}, - {"aten::mkldnn_convolution_backward_weights", ""}, - {"aten::mkldnn_convolution_backward", ""}, - {"aten::miopen_convolution_backward_input", ""}, - {"aten::miopen_convolution_backward", ""}, - {"aten::miopen_convolution_backward_bias", ""}, - {"aten::miopen_convolution_backward_weight", ""}, - {"aten::miopen_convolution_transpose_backward", ""}, - {"aten::miopen_convolution_transpose_backward_input", ""}, - {"aten::miopen_convolution_transpose_backward_weight", ""}, - {"aten::miopen_depthwise_convolution_backward_input", ""}, - {"aten::miopen_depthwise_convolution_backward", ""}, - {"aten::miopen_depthwise_convolution_backward_weight", ""}, - {"aten::mm", ""}, - {"aten::_sparse_mm", ""}, - {"aten::mode", ""}, - {"aten::mul", "Tensor"}, - {"aten::mul_", "Tensor"}, - {"aten::mul", "Scalar"}, - {"aten::mul_", "Scalar"}, - {"aten::mv", ""}, - {"aten::mvlgamma", ""}, - {"aten::mvlgamma_", ""}, - {"aten::narrow_copy", ""}, - {"aten::narrow", ""}, - {"aten::batch_norm_stats", ""}, - {"aten::_nnpack_available", ""}, - {"aten::_nnpack_spatial_convolution_backward", ""}, - {"aten::_nnpack_spatial_convolution_backward_input", ""}, - {"aten::_nnpack_spatial_convolution_backward_weight", ""}, - {"aten::ones_like", ""}, - {"aten::pairwise_distance", ""}, - {"aten::cdist", ""}, - {"aten::_cdist_backward", ""}, - {"aten::pdist", ""}, - {"aten::_pdist_forward", ""}, - {"aten::_pdist_backward", ""}, - {"aten::cosine_similarity", ""}, - {"aten::permute", ""}, - {"aten::numpy_T", ""}, - {"aten::pixel_shuffle", ""}, - {"aten::is_pinned", ""}, - {"aten::pin_memory", ""}, - {"aten::pinverse", ""}, - {"aten::poisson_nll_loss", ""}, - {"aten::rand_like", ""}, - {"aten::randint_like", ""}, - {"aten::randint_like", "low"}, - {"aten::randn_like", ""}, - {"aten::reciprocal", ""}, - {"aten::reciprocal_", ""}, - {"aten::neg", ""}, - {"aten::neg_", ""}, - {"aten::repeat", ""}, - {"aten::repeat_interleave", "Tensor"}, - {"aten::repeat_interleave", "self_Tensor"}, - {"aten::repeat_interleave", "self_int"}, - {"aten::reshape", ""}, - {"aten::_mkldnn_reshape", ""}, - {"aten::reshape_as", 
""}, - {"aten::round", ""}, - {"aten::round_", ""}, - {"aten::rrelu", ""}, - {"aten::rrelu_", ""}, - {"aten::relu", ""}, - {"aten::relu_", ""}, - {"aten::prelu", ""}, - {"aten::prelu_backward", ""}, - {"aten::gelu", ""}, - {"aten::gelu_backward", ""}, - {"aten::hardshrink", ""}, - {"aten::hardshrink_backward", ""}, - {"aten::rsqrt", ""}, - {"aten::rsqrt_", ""}, - {"aten::select", "int"}, - {"aten::selu", ""}, - {"aten::selu_", ""}, - {"aten::celu", ""}, - {"aten::celu_", ""}, - {"aten::sigmoid", ""}, - {"aten::sigmoid_", ""}, - {"aten::sin", ""}, - {"aten::sin_", ""}, - {"aten::sinh", ""}, - {"aten::sinh_", ""}, - {"aten::detach", ""}, - {"aten::detach_", ""}, - {"aten::size", "int"}, - {"aten::slice", "Tensor"}, - {"aten::slogdet", ""}, - {"aten::smm", ""}, - {"aten::_softmax", ""}, - {"aten::_softmax_backward_data", ""}, - {"aten::split", "Tensor"}, - {"aten::split_with_sizes", ""}, - {"aten::squeeze", ""}, - {"aten::squeeze", "dim"}, - {"aten::squeeze_", ""}, - {"aten::squeeze_", "dim"}, - {"aten::sspaddmm", ""}, - {"aten::stack", ""}, - {"aten::stride", "int"}, - {"aten::sum_to_size", ""}, - {"aten::sqrt", ""}, - {"aten::sqrt_", ""}, - {"aten::std", ""}, - {"aten::std", "dim"}, - {"aten::std_mean", ""}, - {"aten::std_mean", "dim"}, - {"aten::t", ""}, - {"aten::t_", ""}, - {"aten::tan", ""}, - {"aten::tan_", ""}, - {"aten::tanh", ""}, - {"aten::tanh_", ""}, - {"aten::tensordot", ""}, - {"aten::threshold", ""}, - {"aten::threshold_", ""}, - {"aten::threshold_backward", ""}, - {"aten::transpose", "int"}, - {"aten::_mkldnn_transpose", ""}, - {"aten::transpose_", ""}, - {"aten::_mkldnn_transpose_", ""}, - {"aten::one_hot", ""}, - {"aten::flip", ""}, - {"aten::roll", ""}, - {"aten::rot90", ""}, - {"aten::trapz", "x"}, - {"aten::trapz", "dx"}, - {"aten::_trilinear", ""}, - {"aten::triplet_margin_loss", ""}, - {"aten::trunc", ""}, - {"aten::trunc_", ""}, - {"aten::type_as", ""}, - {"aten::_has_compatible_shallow_copy_type", ""}, - {"aten::_unique", ""}, - {"aten::unique_dim", ""}, - {"aten::unique_consecutive", ""}, - {"aten::unique_dim_consecutive", ""}, - {"aten::_unique2", ""}, - {"aten::_unsafe_view", ""}, - {"aten::unsqueeze", ""}, - {"aten::unsqueeze_", ""}, - {"aten::var", ""}, - {"aten::var", "dim"}, - {"aten::var_mean", ""}, - {"aten::var_mean", "dim"}, - {"aten::view_as", ""}, - {"aten::where", "self"}, - {"aten::where", ""}, - {"aten::_s_where", ""}, - {"aten::norm_except_dim", ""}, - {"aten::_weight_norm", ""}, - {"aten::_weight_norm_cuda_interface", ""}, - {"aten::_weight_norm_cuda_interface_backward", ""}, - {"aten::_weight_norm_differentiable_backward", ""}, - {"aten::zeros_like", ""}, - {"aten::_standard_gamma_grad", ""}, - {"aten::_standard_gamma", ""}, - {"aten::_dirichlet_grad", ""}, - {"aten::_sample_dirichlet", ""}, - {"aten::poisson", ""}, - {"aten::native_norm", ""}, - {"aten::_sparse_sum", ""}, - {"aten::_sparse_sum", "dim"}, - {"aten::_sparse_sum_backward", ""}, - {"aten::norm", "Scalar"}, - {"aten::norm", "ScalarOpt_dim"}, - {"aten::frobenius_norm", ""}, - {"aten::frobenius_norm", "dim"}, - {"aten::nuclear_norm", ""}, - {"aten::nuclear_norm", "dim"}, - {"aten::resize_as_", ""}, - {"aten::pow", "Tensor_Scalar"}, - {"aten::zero_", ""}, - {"aten::sub", "Tensor"}, - {"aten::sub_", "Tensor"}, - {"aten::sub", "Scalar"}, - {"aten::sub_", "Scalar"}, - {"aten::rsub", "Tensor"}, - {"aten::rsub", "Scalar"}, - {"aten::_sparse_addmm", ""}, - {"aten::addmm", ""}, - {"aten::addmm_", ""}, - {"aten::sparse_resize_", ""}, - {"aten::sparse_resize_and_clear_", ""}, - 
{"aten::sparse_mask", ""}, - {"aten::to_dense", ""}, - {"aten::to_dense_backward", ""}, - {"aten::sparse_dim", ""}, - {"aten::_dimI", ""}, - {"aten::dense_dim", ""}, - {"aten::_dimV", ""}, - {"aten::_nnz", ""}, - {"aten::coalesce", ""}, - {"aten::is_coalesced", ""}, - {"aten::_indices", ""}, - {"aten::_values", ""}, - {"aten::_coalesced_", ""}, - {"aten::indices", ""}, - {"aten::values", ""}, - {"aten::hspmm", ""}, - {"aten::copy_sparse_to_sparse_", ""}, - {"aten::numel", ""}, - {"aten::unbind", "int"}, - {"aten::to_sparse", "sparse_dim"}, - {"aten::to_sparse", ""}, - {"aten::to_mkldnn", ""}, - {"aten::mkldnn_reorder_conv2d_weight", ""}, - {"aten::to_mkldnn_backward", ""}, - {"aten::dequantize", ""}, - {"aten::q_scale", ""}, - {"aten::q_zero_point", ""}, - {"aten::q_per_channel_scales", ""}, - {"aten::q_per_channel_zero_points", ""}, - {"aten::int_repr", ""}, - {"aten::_make_per_tensor_quantized_tensor", ""}, - {"aten::_make_per_channel_quantized_tensor", ""}, - {"aten::fake_quantize_per_tensor_affine", ""}, - {"aten::fake_quantize_per_tensor_affine_backward", ""}, - {"aten::to", "other"}, - {"aten::meshgrid", ""}, - {"aten::cartesian_prod", ""}, - {"aten::combinations", ""}, - {"aten::item", ""}, - {"aten::_local_scalar_dense", ""}, - {"aten::_thnn_fused_gru_cell_backward", ""}, - {"aten::lstm", "input"}, - {"aten::lstm", "data"}, - {"aten::gru", "input"}, - {"aten::gru", "data"}, - {"aten::rnn_tanh", "input"}, - {"aten::rnn_tanh", "data"}, - {"aten::rnn_relu", "input"}, - {"aten::rnn_relu", "data"}, - {"aten::quantized_gru", "input"}, - {"aten::quantized_gru", "data"}, - {"aten::quantized_lstm_cell", ""}, - {"aten::quantized_gru_cell", ""}, - {"aten::quantized_rnn_relu_cell", ""}, - {"aten::quantized_rnn_tanh_cell", ""}, - {"aten::_pack_padded_sequence", ""}, - {"aten::_pack_padded_sequence_backward", ""}, - {"aten::_pad_packed_sequence", ""}, - {"aten::set_", "source_Tensor"}, - {"aten::set_", ""}, - {"aten::is_set_to", ""}, - {"aten::masked_fill_", "Scalar"}, - {"aten::masked_fill", "Scalar"}, - {"aten::masked_fill_", "Tensor"}, - {"aten::masked_fill", "Tensor"}, - {"aten::masked_scatter_", ""}, - {"aten::masked_scatter", ""}, - {"aten::view", ""}, - {"aten::put_", ""}, - {"aten::index_add_", ""}, - {"aten::index_add", ""}, - {"aten::index_fill_", "Scalar"}, - {"aten::index_fill", "Scalar"}, - {"aten::index_fill_", "Tensor"}, - {"aten::index_fill", "Tensor"}, - {"aten::scatter_", "src"}, - {"aten::scatter", "src"}, - {"aten::scatter_", "value"}, - {"aten::scatter", "value"}, - {"aten::scatter_add_", ""}, - {"aten::scatter_add", ""}, - {"aten::lt_", "Scalar"}, - {"aten::lt_", "Tensor"}, - {"aten::gt_", "Scalar"}, - {"aten::gt_", "Tensor"}, - {"aten::le_", "Scalar"}, - {"aten::le_", "Tensor"}, - {"aten::ge_", "Scalar"}, - {"aten::ge_", "Tensor"}, - {"aten::eq_", "Scalar"}, - {"aten::eq_", "Tensor"}, - {"aten::ne_", "Scalar"}, - {"aten::ne_", "Tensor"}, - {"aten::__and__", "Scalar"}, - {"aten::__and__", "Tensor"}, - {"aten::__iand__", "Scalar"}, - {"aten::__iand__", "Tensor"}, - {"aten::__or__", "Scalar"}, - {"aten::__or__", "Tensor"}, - {"aten::__ior__", "Scalar"}, - {"aten::__ior__", "Tensor"}, - {"aten::__xor__", "Scalar"}, - {"aten::__xor__", "Tensor"}, - {"aten::__ixor__", "Scalar"}, - {"aten::__ixor__", "Tensor"}, - {"aten::__lshift__", "Scalar"}, - {"aten::__lshift__", "Tensor"}, - {"aten::__ilshift__", "Scalar"}, - {"aten::__ilshift__", "Tensor"}, - {"aten::__rshift__", "Scalar"}, - {"aten::__rshift__", "Tensor"}, - {"aten::__irshift__", "Scalar"}, - {"aten::__irshift__", 
"Tensor"}, - {"aten::lgamma_", ""}, - {"aten::atan2_", ""}, - {"aten::tril_", ""}, - {"aten::triu_", ""}, - {"aten::digamma_", ""}, - {"aten::polygamma_", ""}, - {"aten::renorm_", ""}, - {"aten::pow_", "Scalar"}, - {"aten::pow_", "Tensor"}, - {"aten::lerp_", "Scalar"}, - {"aten::lerp_", "Tensor"}, - {"aten::fmod_", "Scalar"}, - {"aten::fmod_", "Tensor"}, - {"aten::remainder_", "Scalar"}, - {"aten::remainder_", "Tensor"}, - {"aten::addbmm_", ""}, - {"aten::addbmm", ""}, - {"aten::addcdiv_", ""}, - {"aten::random_", "from"}, - {"aten::random_", "to"}, - {"aten::random_", ""}, - {"aten::uniform_", ""}, - {"aten::normal_", ""}, - {"aten::cauchy_", ""}, - {"aten::log_normal_", ""}, - {"aten::exponential_", ""}, - {"aten::geometric_", ""}, - {"aten::diag", ""}, - {"aten::cross", ""}, - {"aten::triu", ""}, - {"aten::tril", ""}, - {"aten::trace", ""}, - {"aten::ne", "Scalar"}, - {"aten::ne", "Tensor"}, - {"aten::eq", "Scalar"}, - {"aten::eq", "Tensor"}, - {"aten::ge", "Scalar"}, - {"aten::ge", "Tensor"}, - {"aten::le", "Scalar"}, - {"aten::le", "Tensor"}, - {"aten::gt", "Scalar"}, - {"aten::gt", "Tensor"}, - {"aten::lt", "Scalar"}, - {"aten::lt", "Tensor"}, - {"aten::take", ""}, - {"aten::index_select", ""}, - {"aten::masked_select", ""}, - {"aten::nonzero", ""}, - {"aten::nonzero_numpy", ""}, - {"aten::gather", ""}, - {"aten::_gather_sparse_backward", ""}, - {"aten::addcmul", ""}, - {"aten::addcmul_", ""}, - {"aten::addcdiv", ""}, - {"aten::lstsq", ""}, - {"aten::triangular_solve", ""}, - {"aten::_triangular_solve_helper", ""}, - {"aten::symeig", ""}, - {"aten::_symeig_helper", ""}, - {"aten::eig", ""}, - {"aten::svd", ""}, - {"aten::_svd_helper", ""}, - {"aten::cholesky", ""}, - {"aten::_cholesky_helper", ""}, - {"aten::cholesky_solve", ""}, - {"aten::_cholesky_solve_helper", ""}, - {"aten::solve", ""}, - {"aten::_solve_helper", ""}, - {"aten::cholesky_inverse", ""}, - {"aten::qr", ""}, - {"aten::_qr_helper", ""}, - {"aten::geqrf", ""}, - {"aten::orgqr", ""}, - {"aten::ormqr", ""}, - {"aten::_lu_with_info", ""}, - {"aten::lu_solve", ""}, - {"aten::_lu_solve_helper", ""}, - {"aten::multinomial", ""}, - {"aten::_multinomial_alias_setup", ""}, - {"aten::_multinomial_alias_draw", ""}, - {"aten::lgamma", ""}, - {"aten::digamma", ""}, - {"aten::polygamma", ""}, - {"aten::erfinv", ""}, - {"aten::erfinv_", ""}, - {"aten::sign", ""}, - {"aten::sign_", ""}, - {"aten::dist", ""}, - {"aten::atan2", ""}, - {"aten::lerp", "Scalar"}, - {"aten::lerp", "Tensor"}, - {"aten::histc", ""}, - {"aten::fmod", "Scalar"}, - {"aten::fmod", "Tensor"}, - {"aten::remainder", "Scalar"}, - {"aten::remainder", "Tensor"}, - {"aten::min", "other"}, - {"aten::min", ""}, - {"aten::max", "other"}, - {"aten::max", ""}, - {"aten::median", ""}, - {"aten::sort", ""}, - {"aten::argsort", ""}, - {"aten::topk", ""}, - {"aten::all", ""}, - {"aten::any", ""}, - {"aten::renorm", ""}, - {"aten::unfold", ""}, - {"aten::equal", ""}, - {"aten::pow", "Tensor_Tensor"}, - {"aten::pow", "Scalar"}, - {"aten::normal", "Tensor_float"}, - {"aten::normal", "float_Tensor"}, - {"aten::normal", "Tensor_Tensor"}, - {"aten::alias", ""}, - {"aten::_addr", ""}, - {"aten::_addr_", ""}, - {"aten::_index_copy_", ""}, - {"aten::_cumsum", ""}, - {"aten::_cumprod", ""}, - {"aten::_var", ""}, - {"aten::_std", ""}, - {"aten::_cat", ""}, - {"aten::_mode", ""}, - {"aten::_max", ""}, - {"aten::_min", ""}, - {"aten::mse_loss", ""}, - {"aten::mse_loss_backward", ""}, - {"aten::l1_loss", ""}, - {"aten::l1_loss_backward", ""}, - {"aten::multilabel_margin_loss", ""}, - 
{"aten::multilabel_margin_loss_forward", ""}, - {"aten::multilabel_margin_loss_backward", ""}, - {"aten::smooth_l1_loss", ""}, - {"aten::smooth_l1_loss_backward", ""}, - {"aten::soft_margin_loss", ""}, - {"aten::soft_margin_loss_backward", ""}, - {"aten::elu", ""}, - {"aten::elu_backward", ""}, - {"aten::elu_", ""}, - {"aten::glu", ""}, - {"aten::glu_backward", ""}, - {"aten::hardtanh", ""}, - {"aten::hardtanh_backward", ""}, - {"aten::hardtanh_", ""}, - {"aten::leaky_relu", ""}, - {"aten::leaky_relu_backward", ""}, - {"aten::leaky_relu_", ""}, - {"aten::log_sigmoid", ""}, - {"aten::log_sigmoid_forward", ""}, - {"aten::log_sigmoid_backward", ""}, - {"aten::rrelu_with_noise", ""}, - {"aten::rrelu_with_noise_backward", ""}, - {"aten::rrelu_with_noise_", ""}, - {"aten::softplus", ""}, - {"aten::softplus_backward", ""}, - {"aten::softshrink", ""}, - {"aten::softshrink_backward", ""}, - {"aten::adaptive_avg_pool2d", ""}, - {"aten::mkldnn_adaptive_avg_pool2d", ""}, - {"aten::_adaptive_avg_pool2d", ""}, - {"aten::_adaptive_avg_pool2d_backward", ""}, - {"aten::adaptive_avg_pool3d", ""}, - {"aten::adaptive_avg_pool3d_backward", ""}, - {"aten::adaptive_max_pool2d", ""}, - {"aten::adaptive_max_pool2d_backward", ""}, - {"aten::adaptive_max_pool3d", ""}, - {"aten::adaptive_max_pool3d_backward", ""}, - {"aten::avg_pool2d", ""}, - {"aten::avg_pool2d_backward", ""}, - {"aten::avg_pool3d", ""}, - {"aten::avg_pool3d_backward", ""}, - {"aten::fractional_max_pool2d", ""}, - {"aten::fractional_max_pool2d_backward", ""}, - {"aten::fractional_max_pool3d", ""}, - {"aten::fractional_max_pool3d_backward", ""}, - {"aten::max_pool2d_with_indices", ""}, - {"aten::max_pool2d_with_indices_backward", ""}, - {"aten::max_pool3d_with_indices", ""}, - {"aten::max_pool3d_with_indices_backward", ""}, - {"aten::max_unpool2d", ""}, - {"aten::max_unpool2d_backward", ""}, - {"aten::max_unpool3d", ""}, - {"aten::max_unpool3d_backward", ""}, - {"aten::reflection_pad1d", ""}, - {"aten::reflection_pad1d_backward", ""}, - {"aten::reflection_pad2d", ""}, - {"aten::reflection_pad2d_backward", ""}, - {"aten::replication_pad1d", ""}, - {"aten::replication_pad1d_backward", ""}, - {"aten::replication_pad2d", ""}, - {"aten::replication_pad2d_backward", ""}, - {"aten::replication_pad3d", ""}, - {"aten::replication_pad3d_backward", ""}, - {"aten::upsample_linear1d", ""}, - {"aten::upsample_linear1d_backward", ""}, - {"aten::upsample_bilinear2d", ""}, - {"aten::upsample_bilinear2d_backward", ""}, - {"aten::upsample_bicubic2d", ""}, - {"aten::upsample_bicubic2d_backward", ""}, - {"aten::upsample_trilinear3d", ""}, - {"aten::upsample_trilinear3d_backward", ""}, - {"aten::upsample_nearest1d", ""}, - {"aten::upsample_nearest1d_backward", ""}, - {"aten::upsample_nearest2d", ""}, - {"aten::upsample_nearest2d_backward", ""}, - {"aten::upsample_nearest3d", ""}, - {"aten::upsample_nearest3d_backward", ""}, - {"aten::sigmoid_backward", ""}, - {"aten::tanh_backward", ""}, - {"aten::slow_conv_transpose2d_backward", "output_mask"}, - {"aten::slow_conv_transpose3d_backward", "output_mask"}, - {"aten::thnn_conv2d_backward", "output_mask"}, - {"aten::thnn_conv_depthwise2d_backward", "output_mask"}, - {"aten::thnn_conv3d_backward", "output_mask"}, - {"aten::slow_conv_dilated2d_backward", ""}, - {"aten::slow_conv_dilated3d_backward", ""}, - {"aten::col2im", ""}, - {"aten::col2im_backward", ""}, - {"aten::im2col", ""}, - {"aten::im2col_backward", ""}, - {"", ""} - }; - return ops.count(std::make_pair(opName.name.c_str(), opName.overload_name.c_str())) != 0; -} - 
-bool aten_op_is_not_moved_to_c10_yet(const c10::OperatorName& opName) {
- static std::unordered_set<std::pair<const char*, const char*>, OpNameHash, OpNameEquals> ops {
- {"aten::backward", ""},
- {"aten::set_data", ""},
- {"aten::is_leaf", ""},
- {"aten::output_nr", ""},
- {"aten::_version", ""},
- #ifdef BUILD_NAMEDTENSOR
- {"aten::rename_", ""},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::rename", ""},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::align_to", ""},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::refine_names", ""},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::unflatten", ""},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::unflatten", ""},
- #endif
- {"aten::_cudnn_rnn", ""},
- {"aten::_cudnn_rnn_backward", ""},
- {"aten::_cudnn_init_dropout_state", ""},
- {"aten::_sobol_engine_draw", ""},
- {"aten::abs", "out"},
- {"aten::acos", "out"},
- {"aten::add", "out"},
- {"aten::addmv", "out"},
- {"aten::addr", "out"},
- {"aten::all", "out"},
- #ifdef BUILD_NAMEDTENSOR
- {"aten::all", "dimname"},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::all", "dimname_out"},
- #endif
- {"aten::any", "out"},
- #ifdef BUILD_NAMEDTENSOR
- {"aten::any", "dimname"},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::any", "dimname_out"},
- #endif
- {"aten::arange", ""},
- {"aten::arange", "start"},
- {"aten::arange", "start_step"},
- {"aten::arange", "out"},
- {"aten::arange", "start_out"},
- {"aten::asin", "out"},
- {"aten::atan", "out"},
- {"aten::baddbmm", "out"},
- {"aten::bartlett_window", ""},
- {"aten::bartlett_window", "periodic"},
- {"aten::batch_norm", ""},
- {"aten::_batch_norm_impl_index", ""},
- {"aten::_batch_norm_impl_index_backward", ""},
- {"aten::bernoulli", "out"},
- {"aten::bilinear", ""},
- {"aten::binary_cross_entropy_with_logits", ""},
- {"aten::binary_cross_entropy_with_logits_backward", ""},
- {"aten::bincount", ""},
- {"aten::bitwise_not", "out"},
- {"aten::logical_not", "out"},
- {"aten::logical_xor", "out"},
- {"aten::blackman_window", ""},
- {"aten::blackman_window", "periodic"},
- {"aten::bmm", "out"},
- {"aten::cat", "out"},
- #ifdef BUILD_NAMEDTENSOR
- {"aten::cat", "names"},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::cat", "names_out"},
- #endif
- {"aten::ceil", "out"},
- {"aten::clamp", "out"},
- {"aten::clamp_max", "out"},
- {"aten::clamp_min", "out"},
- {"aten::contiguous", ""},
- {"aten::convolution", ""},
- {"aten::convolution_overrideable", ""},
- {"aten::convolution_backward_overrideable", ""},
- {"aten::_convolution", ""},
- {"aten::_convolution_nogroup", ""},
- {"aten::_convolution_double_backward", ""},
- {"aten::conv1d", ""},
- {"aten::conv2d", ""},
- {"aten::conv3d", ""},
- {"aten::conv_transpose1d", ""},
- {"aten::conv_transpose2d", "input"},
- {"aten::conv_transpose3d", "input"},
- {"aten::cos", "out"},
- {"aten::cosh", "out"},
- {"aten::cudnn_batch_norm", ""},
- {"aten::cudnn_batch_norm_backward", ""},
- {"aten::cudnn_convolution", ""},
- {"aten::cudnn_convolution_transpose", ""},
- {"aten::cumsum", ""},
- {"aten::cumsum", "out"},
- #ifdef BUILD_NAMEDTENSOR
- {"aten::cumsum", "dimname"},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::cumsum", "dimname_out"},
- #endif
- {"aten::cumprod", ""},
- {"aten::cumprod", "out"},
- #ifdef BUILD_NAMEDTENSOR
- {"aten::cumprod", "dimname"},
- #endif
- #ifdef BUILD_NAMEDTENSOR
- {"aten::cumprod", "dimname_out"},
- #endif
- {"aten::div", "out"},
- {"aten::dot", "out"},
- {"aten::embedding_bag", ""},
- {"aten::_embedding_bag", ""},
- {"aten::_embedding_bag_backward", ""},
- {"aten::_embedding_bag_sparse_backward", ""},
{"aten::_embedding_bag_dense_backward", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::empty", "names"}, - #endif - {"aten::empty", "memory_format"}, - {"aten::new_empty", ""}, - {"aten::new_full", ""}, - {"aten::new_zeros", ""}, - {"aten::_empty_affine_quantized", ""}, - {"aten::_empty_per_channel_affine_quantized", ""}, - {"aten::empty", "out"}, - {"aten::empty_like", "dtype"}, - {"aten::empty_strided", ""}, - {"aten::erf", "out"}, - {"aten::erfc", "out"}, - {"aten::exp", "out"}, - {"aten::expm1", "out"}, - {"aten::eye", ""}, - {"aten::eye", "m"}, - {"aten::eye", "out"}, - {"aten::eye", "m_out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::flatten", "named_out_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::flatten", "using_names"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::flatten", "DimnameList"}, - #endif - {"aten::floor", "out"}, - {"aten::frac", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::full", "names"}, - #endif - {"aten::full", ""}, - {"aten::full", "out"}, - {"aten::full_like", "dtype"}, - {"aten::from_file", ""}, - {"aten::hann_window", ""}, - {"aten::hann_window", "periodic"}, - {"aten::hamming_window", ""}, - {"aten::hamming_window", "periodic"}, - {"aten::hamming_window", "periodic_alpha"}, - {"aten::hamming_window", "periodic_alpha_beta"}, - {"aten::ger", "out"}, - {"aten::group_norm", ""}, - {"aten::_cufft_set_plan_cache_max_size", ""}, - {"aten::_cufft_clear_plan_cache", ""}, - {"aten::index", "Tensor"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::index_copy_", "dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::index_copy", "dimname"}, - #endif - {"aten::index_put_", ""}, - {"aten::index_put", ""}, - {"aten::_index_put_impl_", ""}, - {"aten::instance_norm", ""}, - {"aten::inverse", "out"}, - {"aten::kthvalue", "values"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::kthvalue", "dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::kthvalue", "dimname_out"}, - #endif - {"aten::layer_norm", ""}, - {"aten::native_layer_norm", ""}, - {"aten::native_layer_norm_backward", ""}, - {"aten::native_layer_norm_double_backward", ""}, - {"aten::linear", ""}, - {"aten::mkldnn_linear", ""}, - {"aten::linspace", ""}, - {"aten::linspace", "out"}, - {"aten::log", "out"}, - {"aten::log10", "out"}, - {"aten::log1p", "out"}, - {"aten::log2", "out"}, - {"aten::logspace", ""}, - {"aten::logspace", "out"}, - {"aten::log_softmax", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::log_softmax", ""}, - #endif - {"aten::logsumexp", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::logsumexp", "names"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::logsumexp", "names_out"}, - #endif - {"aten::matmul", "out"}, - {"aten::max", "dim_max"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::max", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::max", "names_dim_max"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::max_values", "names"}, - #endif - {"aten::mean", ""}, - {"aten::mean", "dim"}, - {"aten::mean", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::mean", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::mean", "names_out"}, - #endif - {"aten::median", "dim_values"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::median", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::median", "names_dim_values"}, - #endif - {"aten::min", "dim_min"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::min", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::min", "names_dim_min"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::min_values", "names"}, - #endif - {"aten::mkldnn_convolution", ""}, - {"aten::miopen_batch_norm", ""}, - 
{"aten::miopen_batch_norm_backward", ""}, - {"aten::miopen_convolution", ""}, - {"aten::miopen_convolution_transpose", ""}, - {"aten::miopen_depthwise_convolution", ""}, - {"aten::miopen_rnn", ""}, - {"aten::miopen_rnn_backward", ""}, - {"aten::mm", "out"}, - {"aten::mode", "values"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::mode", "dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::mode", "dimname_out"}, - #endif - {"aten::mul", "out"}, - {"aten::mv", "out"}, - {"aten::native_batch_norm", ""}, - {"aten::batch_norm_elemt", ""}, - {"aten::batch_norm_gather_stats", ""}, - {"aten::batch_norm_gather_stats_with_counts", ""}, - {"aten::native_batch_norm_backward", ""}, - {"aten::batch_norm_backward_reduce", ""}, - {"aten::batch_norm_backward_elemt", ""}, - {"aten::batch_norm_update_stats", ""}, - {"aten::_nnpack_spatial_convolution", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::ones", "names"}, - #endif - {"aten::ones", ""}, - {"aten::ones", "out"}, - {"aten::ones_like", "dtype"}, - {"aten::scalar_tensor", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::rand", "names"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::rand", "generator_with_names"}, - #endif - {"aten::rand", ""}, - {"aten::rand", "generator"}, - {"aten::rand", "out"}, - {"aten::rand", "generator_out"}, - {"aten::rand_like", "dtype"}, - {"aten::randint", ""}, - {"aten::randint", "generator"}, - {"aten::randint", "low"}, - {"aten::randint", "low_generator"}, - {"aten::randint", "out"}, - {"aten::randint", "generator_out"}, - {"aten::randint", "low_out"}, - {"aten::randint", "low_generator_out"}, - {"aten::randint_like", "dtype"}, - {"aten::randint_like", "low_dtype"}, - {"aten::randn", ""}, - {"aten::randn", "generator"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::randn", "names"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::randn", "generator_with_names"}, - #endif - {"aten::randn", "out"}, - {"aten::randn", "generator_out"}, - {"aten::randn_like", "dtype"}, - {"aten::randperm", ""}, - {"aten::randperm", "generator"}, - {"aten::randperm", "out"}, - {"aten::randperm", "generator_out"}, - {"aten::range", "step"}, - {"aten::range", ""}, - {"aten::range", "out"}, - {"aten::reciprocal", "out"}, - {"aten::neg", "out"}, - {"aten::round", "out"}, - {"aten::rsqrt", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::select", "Dimname"}, - #endif - {"aten::sigmoid", "out"}, - {"aten::sin", "out"}, - {"aten::sinh", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::size", "Dimname"}, - #endif - {"aten::softmax", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::softmax", ""}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::squeeze", "dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::squeeze_", "dimname"}, - #endif - {"aten::sspaddmm", "out"}, - {"aten::stack", "out"}, - {"aten::stft", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::stride", "Dimname"}, - #endif - {"aten::sum", ""}, - {"aten::sum", "dim_IntList"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::sum", "dim_DimnameList"}, - #endif - {"aten::sum", "IntList_out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::sum", "DimnameList_out"}, - #endif - {"aten::sqrt", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::std_mean", "names_dim"}, - #endif - {"aten::std", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::std", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::std", "names_out"}, - #endif - {"aten::prod", ""}, - {"aten::prod", "dim_int"}, - {"aten::prod", "int_out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::prod", "dim_Dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::prod", "Dimname_out"}, - #endif - {"aten::tan", "out"}, - 
{"aten::tanh", "out"}, - {"aten::threshold", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::transpose", "Dimname"}, - #endif - {"aten::trunc", "out"}, - {"aten::var", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::var", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::var", "names_out"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::var_mean", "names_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::zeros", "names"}, - #endif - {"aten::zeros", ""}, - {"aten::zeros", "out"}, - {"aten::zeros_like", "dtype"}, - {"aten::_sparse_sum", "dtype"}, - {"aten::_sparse_sum", "dim_dtype"}, - {"aten::norm", "ScalarOpt_dtype"}, - {"aten::norm", "ScalarOpt_dim_dtype"}, - {"aten::norm", "dtype_out"}, - {"aten::norm", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::norm", "names_ScalarOpt_dim_dtype"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::norm", "names_ScalarOpt_dim"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::norm", "names_dtype_out"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::norm", "names_out"}, - #endif - {"aten::frobenius_norm", "out"}, - {"aten::nuclear_norm", "out"}, - {"aten::nuclear_norm", "dim_out"}, - {"aten::clone", ""}, - {"aten::pow", "Tensor_Scalar_out"}, - {"aten::sub", "out"}, - {"aten::addmm", "out"}, - {"aten::sparse_coo_tensor", "size"}, - {"aten::sparse_coo_tensor", "indices"}, - {"aten::sparse_coo_tensor", "indices_size"}, - {"aten::_sparse_coo_tensor_unsafe", ""}, - {"aten::_sparse_coo_tensor_with_dims", ""}, - {"aten::_sparse_coo_tensor_with_dims_and_tensors", ""}, - {"aten::hspmm", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::unbind", "Dimname"}, - #endif - {"aten::quantize_per_tensor", ""}, - {"aten::quantize_per_channel", ""}, - {"aten::q_per_channel_axis", ""}, - {"aten::qscheme", ""}, - {"aten::fake_quantize_per_channel_affine", ""}, - {"aten::fake_quantize_per_channel_affine_backward", ""}, - {"aten::to", "dtype_layout"}, - {"aten::to", "device"}, - {"aten::to", "dtype"}, - {"aten::result_type", "Tensor"}, - {"aten::result_type", "Scalar"}, - {"aten::result_type", "Scalar_Tensor"}, - {"aten::result_type", "Scalar_Scalar"}, - {"aten::can_cast", ""}, - {"aten::promote_types", ""}, - {"aten::_thnn_fused_lstm_cell", ""}, - {"aten::_thnn_fused_lstm_cell_backward", ""}, - {"aten::_thnn_differentiable_lstm_cell_backward", ""}, - {"aten::_thnn_fused_gru_cell", ""}, - {"aten::_thnn_differentiable_gru_cell_backward", ""}, - {"aten::lstm_cell", ""}, - {"aten::gru_cell", ""}, - {"aten::rnn_tanh_cell", ""}, - {"aten::rnn_relu_cell", ""}, - {"aten::quantized_lstm", ""}, - {"aten::set_", "source_Storage"}, - {"aten::set_", "source_Storage_storage_offset"}, - {"aten::set_quantizer_", ""}, - #ifdef BUILD_NAMEDTENSOR - {"aten::index_add", "dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::index_fill_", "dimname_Scalar"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::index_fill_", "dimname_Scalar"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::index_fill", "dimname_Scalar"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::index_fill", "dimname_Tensor"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::scatter", "dimname_src"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::scatter", "dimname_value"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::scatter_add", "dimname"}, - #endif - {"aten::addbmm", "out"}, - {"aten::diag", "out"}, - {"aten::cross", "out"}, - {"aten::triu", "out"}, - {"aten::tril", "out"}, - {"aten::tril_indices", ""}, - {"aten::triu_indices", ""}, - {"aten::ne", "Scalar_out"}, - {"aten::ne", "Tensor_out"}, - {"aten::eq", "Scalar_out"}, - {"aten::eq", 
"Tensor_out"}, - {"aten::ge", "Scalar_out"}, - {"aten::ge", "Tensor_out"}, - {"aten::le", "Scalar_out"}, - {"aten::le", "Tensor_out"}, - {"aten::gt", "Scalar_out"}, - {"aten::gt", "Tensor_out"}, - {"aten::lt", "Scalar_out"}, - {"aten::lt", "Tensor_out"}, - {"aten::take", "out"}, - {"aten::index_select", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::index_select", "dimname_out"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::index_select", "dimname"}, - #endif - {"aten::masked_select", "out"}, - {"aten::nonzero", "out"}, - {"aten::gather", "out"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::gather", "dimname_out"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::gather", "dimname"}, - #endif - {"aten::addcmul", "out"}, - {"aten::addcdiv", "out"}, - {"aten::lstsq", "X"}, - {"aten::triangular_solve", "X"}, - {"aten::symeig", "e"}, - {"aten::eig", "e"}, - {"aten::svd", "U"}, - {"aten::cholesky", "out"}, - {"aten::cholesky_solve", "out"}, - {"aten::solve", "solution"}, - {"aten::cholesky_inverse", "out"}, - {"aten::qr", "Q"}, - {"aten::geqrf", "a"}, - {"aten::orgqr", "out"}, - {"aten::ormqr", "out"}, - {"aten::lu_solve", "out"}, - {"aten::multinomial", "out"}, - {"aten::lgamma", "out"}, - {"aten::digamma", "out"}, - {"aten::polygamma", "out"}, - {"aten::erfinv", "out"}, - {"aten::sign", "out"}, - {"aten::atan2", "out"}, - {"aten::lerp", "Scalar_out"}, - {"aten::lerp", "Tensor_out"}, - {"aten::histc", "out"}, - {"aten::fmod", "Scalar_out"}, - {"aten::fmod", "Tensor_out"}, - {"aten::remainder", "Scalar_out"}, - {"aten::remainder", "Tensor_out"}, - {"aten::min", "out"}, - {"aten::max", "out"}, - {"aten::sort", "values"}, - #ifdef BUILD_NAMEDTENSOR - {"aten::sort", "dimname_values"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::sort", "dimname"}, - #endif - #ifdef BUILD_NAMEDTENSOR - {"aten::argsort", "dimname"}, - #endif - {"aten::topk", "values"}, - {"aten::renorm", "out"}, - {"aten::pow", "Tensor_Tensor_out"}, - {"aten::pow", "Scalar_out"}, - {"aten::normal", "Tensor_float_out"}, - {"aten::normal", "float_Tensor_out"}, - {"aten::normal", "Tensor_Tensor_out"}, - {"aten::normal", "float_float"}, - {"aten::normal", "float_float_out"}, - {"aten::_addr", "out"}, - {"aten::_cumsum", "out"}, - {"aten::_cumprod", "out"}, - {"aten::_cat", "out"}, - {"aten::_mode", "values"}, - {"aten::_max", "max"}, - {"aten::_min", "min"}, - {"aten::binary_cross_entropy", "out"}, - {"aten::binary_cross_entropy", ""}, - {"aten::binary_cross_entropy_backward", "grad_input"}, - {"aten::binary_cross_entropy_backward", ""}, - {"aten::mse_loss", "out"}, - {"aten::mse_loss_backward", "grad_input"}, - {"aten::l1_loss", "out"}, - {"aten::l1_loss_backward", "grad_input"}, - {"aten::multi_margin_loss", "out"}, - {"aten::multi_margin_loss", ""}, - {"aten::multi_margin_loss_backward", "grad_input"}, - {"aten::multi_margin_loss_backward", ""}, - {"aten::multilabel_margin_loss", "out"}, - {"aten::multilabel_margin_loss_forward", "output"}, - {"aten::multilabel_margin_loss_backward", "grad_input"}, - {"aten::nll_loss", "out"}, - {"aten::nll_loss", ""}, - {"aten::nll_loss_forward", "output"}, - {"aten::nll_loss_forward", ""}, - {"aten::nll_loss_backward", "grad_input"}, - {"aten::nll_loss_backward", ""}, - {"aten::nll_loss2d", "out"}, - {"aten::nll_loss2d", ""}, - {"aten::nll_loss2d_forward", "output"}, - {"aten::nll_loss2d_forward", ""}, - {"aten::nll_loss2d_backward", "grad_input"}, - {"aten::nll_loss2d_backward", ""}, - {"aten::smooth_l1_loss", "out"}, - {"aten::smooth_l1_loss_backward", "grad_input"}, - {"aten::soft_margin_loss", "out"}, - 
{"aten::soft_margin_loss_backward", "grad_input"}, - {"aten::elu", "out"}, - {"aten::elu_backward", "grad_input"}, - {"aten::glu", "out"}, - {"aten::glu_backward", "grad_input"}, - {"aten::hardtanh", "out"}, - {"aten::hardtanh_backward", "grad_input"}, - {"aten::leaky_relu", "out"}, - {"aten::leaky_relu_backward", "grad_input"}, - {"aten::log_sigmoid", "out"}, - {"aten::log_sigmoid_forward", "output"}, - {"aten::log_sigmoid_backward", "grad_input"}, - {"aten::rrelu_with_noise", "out"}, - {"aten::rrelu_with_noise_backward", "grad_input"}, - {"aten::softplus", "out"}, - {"aten::softplus_backward", "grad_input"}, - {"aten::softshrink", "out"}, - {"aten::softshrink_backward", "grad_input"}, - {"aten::adaptive_avg_pool2d", "out"}, - {"aten::adaptive_avg_pool3d", "out"}, - {"aten::adaptive_avg_pool3d_backward", "grad_input"}, - {"aten::adaptive_max_pool2d", "out"}, - {"aten::adaptive_max_pool2d_backward", "grad_input"}, - {"aten::adaptive_max_pool3d", "out"}, - {"aten::adaptive_max_pool3d_backward", "grad_input"}, - {"aten::avg_pool2d", "out"}, - {"aten::avg_pool2d_backward", "grad_input"}, - {"aten::avg_pool3d", "out"}, - {"aten::avg_pool3d_backward", "grad_input"}, - {"aten::fractional_max_pool2d", "output"}, - {"aten::fractional_max_pool2d_backward", "grad_input"}, - {"aten::fractional_max_pool3d", "output"}, - {"aten::fractional_max_pool3d_backward", "grad_input"}, - {"aten::max_pool2d_with_indices", "out"}, - {"aten::max_pool2d_with_indices_backward", "grad_input"}, - {"aten::max_pool3d_with_indices", "out"}, - {"aten::max_pool3d_with_indices_backward", "grad_input"}, - {"aten::max_unpool2d", "out"}, - {"aten::max_unpool2d_backward", "grad_input"}, - {"aten::max_unpool3d", "out"}, - {"aten::max_unpool3d_backward", "grad_input"}, - {"aten::reflection_pad1d", "out"}, - {"aten::reflection_pad1d_backward", "grad_input"}, - {"aten::reflection_pad2d", "out"}, - {"aten::reflection_pad2d_backward", "grad_input"}, - {"aten::replication_pad1d", "out"}, - {"aten::replication_pad1d_backward", "grad_input"}, - {"aten::replication_pad2d", "out"}, - {"aten::replication_pad2d_backward", "grad_input"}, - {"aten::replication_pad3d", "out"}, - {"aten::replication_pad3d_backward", "grad_input"}, - {"aten::upsample_linear1d", "out"}, - {"aten::upsample_linear1d_backward", "grad_input"}, - {"aten::upsample_bilinear2d", "out"}, - {"aten::upsample_bilinear2d_backward", "grad_input"}, - {"aten::upsample_bicubic2d", "out"}, - {"aten::upsample_bicubic2d_backward", "grad_input"}, - {"aten::upsample_trilinear3d", "out"}, - {"aten::upsample_trilinear3d_backward", "grad_input"}, - {"aten::upsample_nearest1d", "out"}, - {"aten::upsample_nearest1d_backward", "grad_input"}, - {"aten::upsample_nearest2d", "out"}, - {"aten::upsample_nearest2d_backward", "grad_input"}, - {"aten::upsample_nearest3d", "out"}, - {"aten::upsample_nearest3d_backward", "grad_input"}, - {"aten::sigmoid_backward", "grad_input"}, - {"aten::tanh_backward", "grad_input"}, - {"aten::slow_conv_transpose2d", "out"}, - {"aten::slow_conv_transpose2d", ""}, - {"aten::slow_conv_transpose2d_backward", "grad_output"}, - {"aten::slow_conv_transpose3d", "out"}, - {"aten::slow_conv_transpose3d", ""}, - {"aten::slow_conv_transpose3d_backward", "grad_output"}, - {"aten::thnn_conv2d", "out"}, - {"aten::thnn_conv2d", ""}, - {"aten::thnn_conv2d_forward", "output"}, - {"aten::thnn_conv2d_forward", ""}, - {"aten::thnn_conv2d_backward", "grad_input"}, - {"aten::thnn_conv_depthwise2d", "out"}, - {"aten::thnn_conv_depthwise2d", ""}, - {"aten::thnn_conv_depthwise2d_forward", 
"out"}, - {"aten::thnn_conv_depthwise2d_forward", ""}, - {"aten::thnn_conv_depthwise2d_backward", "grad_input"}, - {"aten::thnn_conv3d", "out"}, - {"aten::thnn_conv3d", ""}, - {"aten::thnn_conv3d_forward", "output"}, - {"aten::thnn_conv3d_forward", ""}, - {"aten::thnn_conv3d_backward", "grad_input"}, - {"aten::slow_conv_dilated2d", ""}, - {"aten::slow_conv_dilated3d", ""}, - {"aten::col2im", "out"}, - {"aten::col2im_backward", "grad_input"}, - {"aten::im2col", "out"}, - {"aten::im2col_backward", "grad_input"}, - {"", ""} - }; - return ops.count(std::make_pair(opName.name.c_str(), opName.overload_name.c_str())) != 0; -} - -} diff --git a/aten/src/ATen/core/TensorBody.h b/aten/src/ATen/core/TensorBody.h deleted file mode 100644 index edd2be2d1cdaf..0000000000000 --- a/aten/src/ATen/core/TensorBody.h +++ /dev/null @@ -1,1023 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace caffe2 { -class Tensor; -} -namespace c10{ -struct TensorOptions; -} -namespace at { -struct Generator; -struct Type; -class DeprecatedTypeProperties; -class Tensor; -} // namespace at - -namespace at { - -class Tensor; -using TensorList = ArrayRef; - -struct Quantizer; -// This is temporary typedef to enable Quantizer in aten native function API -// we'll remove them when we are actually exposing Quantizer class -// to frontend -using QuantizerPtr = c10::intrusive_ptr; -using ConstQuantizerPtr = const c10::intrusive_ptr&; - -// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which -// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr. -// -// For example: -// -// void func(Tensor a) { -// Tensor b = a; -// ... -// } -// -// In this example, when we say Tensor b = a, we are creating a new object that points to the -// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the -// destructor decrements the reference count by calling release() on the TensorImpl it points to. -// The existing constructors, operator overloads, etc. take care to implement the correct semantics. -// -// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and -// special care must be taken to handle this. -class CAFFE2_API Tensor { - public: - Tensor(){}; - // This constructor should not be used by end users and is an implementation - // detail invoked by autogenerated code. - explicit Tensor( - c10::intrusive_ptr tensor_impl) - : impl_(std::move(tensor_impl)) { - if (impl_.get() == nullptr) { - throw std::runtime_error("TensorImpl with nullptr is not supported"); - } - } - Tensor(const Tensor&) = default; - Tensor(Tensor&&) = default; - - - public: - // Creates a new wrapper from TensorImpl. Intentionally a free method because - // it should be used with care. 
diff --git a/aten/src/ATen/core/TensorBody.h b/aten/src/ATen/core/TensorBody.h
deleted file mode 100644
index edd2be2d1cdaf..0000000000000
--- a/aten/src/ATen/core/TensorBody.h
+++ /dev/null
@@ -1,1023 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace caffe2 {
-class Tensor;
-}
-namespace c10{
-struct TensorOptions;
-}
-namespace at {
-struct Generator;
-struct Type;
-class DeprecatedTypeProperties;
-class Tensor;
-} // namespace at
-
-namespace at {
-
-class Tensor;
-using TensorList = ArrayRef<Tensor>;
-
-struct Quantizer;
-// This is temporary typedef to enable Quantizer in aten native function API
-// we'll remove them when we are actually exposing Quantizer class
-// to frontend
-using QuantizerPtr = c10::intrusive_ptr<Quantizer>;
-using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&;
-
-// Tensor is a "generic" object holding a pointer to the underlying TensorImpl object, which
-// has an embedded reference count. In this way, Tensor is similar to boost::intrusive_ptr.
-//
-// For example:
-//
-// void func(Tensor a) {
-//   Tensor b = a;
-//   ...
-// }
-//
-// In this example, when we say Tensor b = a, we are creating a new object that points to the
-// same underlying TensorImpl, and bumps its reference count. When b goes out of scope, the
-// destructor decrements the reference count by calling release() on the TensorImpl it points to.
-// The existing constructors, operator overloads, etc. take care to implement the correct semantics.
-//
-// Note that Tensor can also be NULL, i.e. it is not associated with any underlying TensorImpl, and
-// special care must be taken to handle this.
-class CAFFE2_API Tensor {
- public:
- Tensor(){};
- // This constructor should not be used by end users and is an implementation
- // detail invoked by autogenerated code.
- explicit Tensor(
- c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl)
- : impl_(std::move(tensor_impl)) {
- if (impl_.get() == nullptr) {
- throw std::runtime_error("TensorImpl with nullptr is not supported");
- }
- }
- Tensor(const Tensor&) = default;
- Tensor(Tensor&&) = default;
-
-
- public:
- // Creates a new wrapper from TensorImpl. Intentionally a free method because
- // it should be used with care. Checks necessary invariants
- static Tensor wrap_tensor_impl(
- c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> tensor_impl) {
- Tensor r(std::move(tensor_impl));
- r.enforce_invariants();
- return r;
- }
-
- int64_t dim() const {
- return impl_->dim();
- }
- int64_t storage_offset() const {
- return impl_->storage_offset();
- }
-
- TensorImpl * unsafeGetTensorImpl() const {
- return impl_.get();
- }
- TensorImpl * unsafeReleaseTensorImpl() {
- return impl_.release();
- }
- const c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl>& getIntrusivePtr() const {
- return impl_;
- }
-
- bool defined() const {
- return impl_;
- }
-
- void reset() {
- impl_.reset();
- }
-
- // The following overloads are very intriguing. Consider the following
- // program:
- //
- // x[1] = 3;
- //
- // We would expect that the first entry of x is written to 3. But how can we
- // actually achieve this? x[1] evaluates to a tensor...
- //
- // The answer is, using a ref-qualifier. x[1] is an rvalue, which cannot be
- // (profitably) assigned to in the traditional sense, so we overload
- // assignment to mean, "Actually, copy 3 into the tensor data." This is done
- // with an rvalue-reference ref-qualified overload (the methods with && at the
- // end of their type.)
- //
- // There's one more fly in the ointment: We also want
- //
- // Tensor x = y;
- //
- // to work, and we want it NOT to copy. So we need a traditional operator=
- // overload. But we MUST specify a mutable lvalue ref-qualifier, to
- // disambiguate the traditional overload from the rvalue-reference
- // ref-qualified overload. Otherwise, it will be ambiguous, because
- // a non ref-qualified method is eligible for all situations.
-
- // Unfortunately, we have to write these constructors out manually
- // to work around an MSVC bug:
- // error C2580: 'at::Tensor &at::Tensor::operator =(const at::Tensor &) &':
- // multiple versions of a defaulted special member functions are not allowed
- // Tensor& operator=(const Tensor&) & = default;
- // Tensor& operator=(Tensor&&) & = default;
- Tensor& operator=(const Tensor& x) & {
- impl_ = x.impl_;
- return *this;
- }
- Tensor& operator=(Tensor&& x) & {
- impl_ = std::move(x.impl_);
- return *this;
- }
-
- Tensor& operator=(Scalar v) &&;
- Tensor& operator=(const Tensor&) &&;
- Tensor& operator=(Tensor&&) &&;
-
- bool is_same(const Tensor& other) const noexcept {
- return impl_ == other.impl_;
- }
- size_t use_count() const noexcept {
- return impl_.use_count();
- }
- size_t weak_use_count() const noexcept {
- return impl_.weak_use_count();
- }
-
- std::string toString() const;
-
- IntArrayRef sizes() const {
- return impl_->sizes();
- }
- IntArrayRef strides() const {
- return impl_->strides();
- }
-#ifdef BUILD_NAMEDTENSOR
- // See impl::get_opt_names in ATen/NamedTensor.h for docs.
- optional<DimnameList> opt_names() const {
- return impl::get_opt_names(unsafeGetTensorImpl());
- }
- // See impl::get_names in ATen/NamedTensor.h for docs.
- DimnameList names() const {
- return impl::get_names(unsafeGetTensorImpl());
- }
-#endif
- int64_t ndimension() const {
- return dim();
- }
-
- bool is_contiguous(at::MemoryFormat memory_format=at::MemoryFormat::Contiguous) const {
- return impl_->is_contiguous(memory_format);
- }
-
- bool is_non_overlapping_and_dense() const {
- return impl_->is_non_overlapping_and_dense();
- }
-
- at::MemoryFormat suggest_memory_format() const {
- if (impl_->is_strides_like_channels_last()) {
- return at::MemoryFormat::ChannelsLast;
- }
- return at::MemoryFormat::Contiguous;
- }
-
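To make the ref-qualifier discussion above concrete, a short usage sketch (a hedged illustration using the torch C++ frontend; variable names are illustrative):

  torch::Tensor x = torch::zeros({3});
  torch::Tensor y = x;  // lvalue operator=: y now shares x's TensorImpl, no data copy
  x[1] = 3;             // x[1] is a temporary Tensor view, so the rvalue-qualified
                        // operator=(Scalar) && writes 3 into the underlying storage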
- // Total bytes consumed by the "view" of elements of the array. Does not
- // include size of metadata. The number reported here does not necessarily
- // correspond to the true physical memory consumed by a tensor; instead,
- // it reports the memory the tensor would take *if* it were contiguous.
- // Defined to be numel() * itemsize()
- size_t nbytes() const {
- return impl_->numel() * impl_->itemsize();
- }
-
- // Length of one array element in bytes. This is the traditional
- // Numpy naming.
- size_t itemsize() const {
- return impl_->itemsize();
- }
-
- // Same as itemsize(). This is the PyTorch naming.
- size_t element_size() const {
- return impl_->itemsize();
- }
-
- DeprecatedTypeProperties & type() const {
- return globalDeprecatedTypePropertiesRegistry().getDeprecatedTypeProperties(
- tensorTypeIdToBackend(legacyExtractTypeId(type_set())),
- scalar_type(),
- is_variable());
- }
- TensorTypeSet type_set() const {
- return impl_->type_set();
- }
- ScalarType scalar_type() const {
- return typeMetaToScalarType(impl_->dtype());
- }
- bool has_storage() const {
- return defined() && impl_->has_storage();
- }
- const Storage& storage() const {
- return impl_->storage();
- }
- bool is_alias_of(const at::Tensor& other) const{
- return impl_->storage().is_alias_of(other.storage());
- }
- Tensor toType(ScalarType t) const;
- Tensor toBackend(Backend b) const;
-
- /// Returns true if the `Tensor` is actually a `torch::autograd::Variable`.
- /// Defined in Type.h because of include order issues.
- bool is_variable() const noexcept {
- return impl_->is_variable();
- }
-
- /// Returns a `Tensor`'s layout. Defined in Type.h
- Layout layout() const noexcept;
-
- /// Returns a `Tensor`'s dtype (`TypeMeta`). Defined in TensorMethods.h
- caffe2::TypeMeta dtype() const noexcept;
-
- /// Returns a `Tensor`'s device.
- Device device() const;
-
- /// Returns a `Tensor`'s device index.
- int64_t get_device() const;
-
- /// Returns if a `Tensor` has CUDA backend.
- bool is_cuda() const;
-
- /// Returns if a `Tensor` has HIP backend.
- bool is_hip() const;
-
- /// Returns if a `Tensor` has sparse backend.
- bool is_sparse() const;
-
- /// Returns if a `Tensor` is mkldnn tensor.
- bool is_mkldnn() const;
-
- /// Returns if a `Tensor` has quantized backend.
- bool is_quantized() const;
-
- /// If a tensor is a quantized tensor, returns its quantizer
- /// TODO: it's not in native_functions.yaml yet as it's not exposed to python
- QuantizerPtr quantizer() const;
-
-#ifdef BUILD_NAMEDTENSOR
- /// Returns if a `Tensor` has any dimension names
- bool has_names() const;
-
- /// Returns a `Tensor`'s dimension names data structure
- const NamedTensorMeta* get_named_tensor_meta() const;
- NamedTensorMeta* get_named_tensor_meta();
-#endif
-
- /// Returns the `TensorOptions` corresponding to this `Tensor`. Defined in
- /// TensorOptions.h.
- TensorOptions options() const;
-
- void* data_ptr() const {
- return this->unsafeGetTensorImpl()->data();
- }
-
- template <typename T>
- T * data_ptr() const;
-
- template <typename T>
- C10_DEPRECATED_MESSAGE("Tensor.data() is deprecated. Please use Tensor.data_ptr() instead.")
- T * data() const {
- return data_ptr<T>();
- }
-
- template <typename T>
- T item() const;
-
- // Purposely not defined here to avoid inlining
- void print() const;
-
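A quick worked example of the nbytes()/itemsize() relationship documented above (a sketch; the shape and dtype are arbitrary assumptions):

  // nbytes() == numel() * itemsize(): a 2x3 float tensor has 6 elements of
  // 4 bytes each, so nbytes() reports 24 regardless of striding.
  torch::Tensor t = torch::ones({2, 3}, torch::kFloat);
  assert(t.numel() == 6);
  assert(t.itemsize() == 4);
  assert(t.nbytes() == 24);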
- // Return a `TensorAccessor` for CPU `Tensor`s. You have to specify scalar type and
- // dimension.
- template <typename T, size_t N>
- TensorAccessor<T,N> accessor() const& {
- static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr<T>()");
- TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim());
- return TensorAccessor<T,N>(data_ptr<T>(),sizes().data(),strides().data());
- }
- template <typename T, size_t N>
- TensorAccessor<T,N> accessor() && = delete;
-
- // Return a `GenericPackedTensorAccessor` for CUDA `Tensor`s. You have to specify scalar type and
- // dimension. You can optionally specify RestrictPtrTraits as a template parameter to
- // cast the data pointer to a __restrict__ pointer.
- // In order to use this, your CUDA kernel has to take a corresponding GenericPackedTensorAccessor
- // as an argument.
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
- GenericPackedTensorAccessor<T,N,PtrTraits,index_t> generic_packed_accessor() const& {
- static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr<T>()");
- TORCH_CHECK(dim() == N, "expected ", N, " dims but tensor has ", dim());
- return GenericPackedTensorAccessor<T,N,PtrTraits,index_t>(static_cast<typename PtrTraits<T>::PtrType>(data_ptr<T>()),sizes().data(),strides().data());
- }
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
- GenericPackedTensorAccessor<T,N,PtrTraits,index_t> generic_packed_accessor() && = delete;
-
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
- PackedTensorAccessor32<T,N,PtrTraits> packed_accessor32() const& {
- return generic_packed_accessor<T,N,PtrTraits,int32_t>();
- }
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
- PackedTensorAccessor32<T,N,PtrTraits> packed_accessor32() && = delete;
-
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
- PackedTensorAccessor64<T,N,PtrTraits> packed_accessor64() const& {
- return generic_packed_accessor<T,N,PtrTraits,int64_t>();
- }
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits>
- PackedTensorAccessor64<T,N,PtrTraits> packed_accessor64() && = delete;
-
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
- C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead")
- GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() const & {
- return generic_packed_accessor<T,N,PtrTraits,index_t>();
- }
- template <typename T, size_t N, template <typename U> class PtrTraits = DefaultPtrTraits, typename index_t = int64_t>
- C10_DEPRECATED_MESSAGE("packed_accessor is deprecated, use packed_accessor32 or packed_accessor64 instead")
- GenericPackedTensorAccessor<T,N,PtrTraits,index_t> packed_accessor() && = delete;
-
- Tensor operator-() const;
- Tensor& operator+=(const Tensor & other);
- Tensor& operator+=(Scalar other);
- Tensor& operator-=(const Tensor & other);
- Tensor& operator-=(Scalar other);
- Tensor& operator*=(const Tensor & other);
- Tensor& operator*=(Scalar other);
- Tensor& operator/=(const Tensor & other);
- Tensor& operator/=(Scalar other);
- Tensor operator[](Scalar index) const;
- Tensor operator[](Tensor index) const;
- Tensor operator[](int64_t index) const;
-
- Tensor cpu() const;
- Tensor cuda() const;
- Tensor hip() const;
-
- // ~~~~~ Autograd API ~~~~~
-
- Tensor& set_requires_grad(bool requires_grad) {
- impl_->set_requires_grad(requires_grad);
- return *this;
- }
- bool requires_grad() const {
- return impl_->requires_grad();
- }
-
- Tensor& grad() {
- return impl_->grad();
- }
- const Tensor& grad() const {
- return impl_->grad();
- }
-
- // STOP. Thinking of adding a method here, which only makes use
- // of other ATen methods? Define it in native_functions.yaml.
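A short sketch of the CPU accessor API declared above (hedged; bounds checking and error handling omitted):

  // accessor<T, N> gives typed N-dimensional indexing into a CPU tensor;
  // the TORCH_CHECK in the declaration fires if the tensor is not N-dimensional.
  torch::Tensor m = torch::rand({4, 5});
  auto acc = m.accessor<float, 2>();
  float sum = 0;
  for (int64_t i = 0; i < acc.size(0); i++) {
    for (int64_t j = 0; j < acc.size(1); j++) {
      sum += acc[i][j];
    }
  }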
-
- //example
- //Tensor * add(Tensor & b);
- void backward(const Tensor & gradient={}, bool keep_graph=false, bool create_graph=false) const;
- void set_data(const Tensor & new_data) const;
- Tensor data() const;
- bool is_leaf() const;
- int64_t output_nr() const;
- int64_t _version() const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor & rename_(c10::optional<DimnameList> names) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor rename(c10::optional<DimnameList> names) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor align_to(DimnameList names) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor align_as(const Tensor & other) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor refine_names(DimnameList names) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor unflatten(Dimname dim, IntArrayRef sizes, DimnameList names) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor unflatten(int64_t dim, IntArrayRef sizes, DimnameList names) const;
- #endif
- Tensor abs() const;
- Tensor & abs_() const;
- Tensor acos() const;
- Tensor & acos_() const;
- Tensor add(const Tensor & other, Scalar alpha=1) const;
- Tensor & add_(const Tensor & other, Scalar alpha=1) const;
- Tensor add(Scalar other, Scalar alpha=1) const;
- Tensor & add_(Scalar other, Scalar alpha=1) const;
- Tensor addmv(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const;
- Tensor & addmv_(const Tensor & mat, const Tensor & vec, Scalar beta=1, Scalar alpha=1) const;
- Tensor addr(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const;
- Tensor & addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta=1, Scalar alpha=1) const;
- Tensor all(int64_t dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor all(Dimname dim, bool keepdim=false) const;
- #endif
- bool allclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const;
- Tensor any(int64_t dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor any(Dimname dim, bool keepdim=false) const;
- #endif
- Tensor argmax(c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) const;
- Tensor argmin(c10::optional<int64_t> dim=c10::nullopt, bool keepdim=false) const;
- Tensor as_strided(IntArrayRef size, IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) const;
- Tensor & as_strided_(IntArrayRef size, IntArrayRef stride, c10::optional<int64_t> storage_offset=c10::nullopt) const;
- Tensor asin() const;
- Tensor & asin_() const;
- Tensor atan() const;
- Tensor & atan_() const;
- Tensor baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
- Tensor & baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
- Tensor bernoulli(Generator * generator=nullptr) const;
- Tensor & bernoulli_(const Tensor & p, Generator * generator=nullptr) const;
- Tensor & bernoulli_(double p=0.5, Generator * generator=nullptr) const;
- Tensor bernoulli(double p, Generator * generator=nullptr) const;
- Tensor bincount(const Tensor & weights={}, int64_t minlength=0) const;
- Tensor bitwise_not() const;
- Tensor & bitwise_not_() const;
- Tensor logical_not() const;
- Tensor & logical_not_() const;
- Tensor logical_xor(const Tensor & other) const;
- Tensor & logical_xor_(const Tensor & other) const;
- Tensor bmm(const Tensor & mat2) const;
- Tensor ceil() const;
- Tensor & ceil_() const;
- std::vector<Tensor> chunk(int64_t chunks, int64_t dim=0) const;
- Tensor clamp(c10::optional<Scalar> min=c10::nullopt, c10::optional<Scalar> max=c10::nullopt) const;
- Tensor & clamp_(c10::optional<Scalar> min=c10::nullopt, c10::optional<Scalar> max=c10::nullopt) const;
- Tensor clamp_max(Scalar max) const;
- Tensor & clamp_max_(Scalar max) const;
- Tensor clamp_min(Scalar min) const;
- Tensor & clamp_min_(Scalar min) const;
- Tensor contiguous(MemoryFormat memory_format=MemoryFormat::Contiguous) const;
- Tensor & copy_(const Tensor & src, bool non_blocking=false) const;
- Tensor cos() const;
- Tensor & cos_() const;
- Tensor cosh() const;
- Tensor & cosh_() const;
- Tensor cumsum(int64_t dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor cumsum(Dimname dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- Tensor cumprod(int64_t dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor cumprod(Dimname dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- Tensor det() const;
- Tensor diag_embed(int64_t offset=0, int64_t dim1=-2, int64_t dim2=-1) const;
- Tensor diagflat(int64_t offset=0) const;
- Tensor diagonal(int64_t offset=0, int64_t dim1=0, int64_t dim2=1) const;
- Tensor & fill_diagonal_(Scalar fill_value, bool wrap=false) const;
- Tensor div(const Tensor & other) const;
- Tensor & div_(const Tensor & other) const;
- Tensor div(Scalar other) const;
- Tensor & div_(Scalar other) const;
- Tensor dot(const Tensor & tensor) const;
- Tensor new_empty(IntArrayRef size, const TensorOptions & options={}) const;
- Tensor new_full(IntArrayRef size, Scalar fill_value, const TensorOptions & options={}) const;
- Tensor new_zeros(IntArrayRef size, const TensorOptions & options={}) const;
- Tensor & resize_(IntArrayRef size) const;
- Tensor erf() const;
- Tensor & erf_() const;
- Tensor erfc() const;
- Tensor & erfc_() const;
- Tensor exp() const;
- Tensor & exp_() const;
- Tensor expm1() const;
- Tensor & expm1_() const;
- Tensor expand(IntArrayRef size, bool implicit=false) const;
- Tensor expand_as(const Tensor & other) const;
- Tensor flatten(int64_t start_dim=0, int64_t end_dim=-1) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor flatten(int64_t start_dim, int64_t end_dim, Dimname out_dim) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor flatten(Dimname start_dim, Dimname end_dim, Dimname out_dim) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor flatten(DimnameList dims, Dimname out_dim) const;
- #endif
- Tensor & fill_(Scalar value) const;
- Tensor & fill_(const Tensor & value) const;
- Tensor floor() const;
- Tensor & floor_() const;
- Tensor frac() const;
- Tensor & frac_() const;
- Tensor ger(const Tensor & vec2) const;
- Tensor fft(int64_t signal_ndim, bool normalized=false) const;
- Tensor ifft(int64_t signal_ndim, bool normalized=false) const;
- Tensor rfft(int64_t signal_ndim, bool normalized=false, bool onesided=true) const;
- Tensor irfft(int64_t signal_ndim, bool normalized=false, bool onesided=true, IntArrayRef signal_sizes={}) const;
- Tensor index(TensorList indices) const;
- Tensor & index_copy_(int64_t dim, const Tensor & index, const Tensor & source) const;
- Tensor index_copy(int64_t dim, const Tensor & index, const Tensor & source) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor & index_copy_(Dimname dim, const Tensor & index, const Tensor & source) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor index_copy(Dimname dim, const Tensor & index, const Tensor & source) const;
- #endif
- Tensor & index_put_(TensorList indices, const Tensor & values, bool accumulate=false) const;
- Tensor index_put(TensorList indices, const Tensor & values, bool accumulate=false) const;
- Tensor inverse() const;
- Tensor isclose(const Tensor & other, double rtol=1e-05, double atol=1e-08, bool equal_nan=false) const;
- bool is_distributed() const;
- bool is_floating_point() const;
- bool is_complex() const;
- bool is_nonzero() const;
- bool is_same_size(const Tensor & other) const;
- bool is_signed() const;
- std::tuple<Tensor,Tensor> kthvalue(int64_t k, int64_t dim=-1, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- std::tuple<Tensor,Tensor> kthvalue(int64_t k, Dimname dim, bool keepdim=false) const;
- #endif
- Tensor log() const;
- Tensor & log_() const;
- Tensor log10() const;
- Tensor & log10_() const;
- Tensor log1p() const;
- Tensor & log1p_() const;
- Tensor log2() const;
- Tensor & log2_() const;
- Tensor logdet() const;
- Tensor log_softmax(int64_t dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor log_softmax(Dimname dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- Tensor logsumexp(IntArrayRef dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor logsumexp(DimnameList dim, bool keepdim=false) const;
- #endif
- Tensor matmul(const Tensor & other) const;
- Tensor matrix_power(int64_t n) const;
- std::tuple<Tensor,Tensor> max(int64_t dim, bool keepdim=false) const;
- Tensor max_values(IntArrayRef dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- std::tuple<Tensor,Tensor> max(Dimname dim, bool keepdim=false) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor max_values(DimnameList dim, bool keepdim=false) const;
- #endif
- Tensor mean(c10::optional<ScalarType> dtype=c10::nullopt) const;
- Tensor mean(IntArrayRef dim, bool keepdim=false, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor mean(DimnameList dim, bool keepdim=false, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- std::tuple<Tensor,Tensor> median(int64_t dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- std::tuple<Tensor,Tensor> median(Dimname dim, bool keepdim=false) const;
- #endif
- std::tuple<Tensor,Tensor> min(int64_t dim, bool keepdim=false) const;
- Tensor min_values(IntArrayRef dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- std::tuple<Tensor,Tensor> min(Dimname dim, bool keepdim=false) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor min_values(DimnameList dim, bool keepdim=false) const;
- #endif
- Tensor mm(const Tensor & mat2) const;
- std::tuple<Tensor,Tensor> mode(int64_t dim=-1, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- std::tuple<Tensor,Tensor> mode(Dimname dim, bool keepdim=false) const;
- #endif
- Tensor mul(const Tensor & other) const;
- Tensor & mul_(const Tensor & other) const;
- Tensor mul(Scalar other) const;
- Tensor & mul_(Scalar other) const;
- Tensor mv(const Tensor & vec) const;
- Tensor mvlgamma(int64_t p) const;
- Tensor & mvlgamma_(int64_t p) const;
- Tensor narrow_copy(int64_t dim, int64_t start, int64_t length) const;
- Tensor narrow(int64_t dim, int64_t start, int64_t length) const;
- Tensor permute(IntArrayRef dims) const;
- Tensor numpy_T() const;
- bool is_pinned() const;
- Tensor pin_memory() const;
- Tensor pinverse(double rcond=1e-15) const;
- Tensor reciprocal() const;
- Tensor & reciprocal_() const;
- Tensor neg() const;
- Tensor & neg_() const;
- Tensor repeat(IntArrayRef repeats) const;
- Tensor repeat_interleave(const Tensor & repeats, c10::optional<int64_t> dim=c10::nullopt) const;
- Tensor repeat_interleave(int64_t repeats, c10::optional<int64_t> dim=c10::nullopt) const;
- Tensor reshape(IntArrayRef shape) const;
- Tensor reshape_as(const Tensor & other) const;
- Tensor round() const;
- Tensor & round_() const;
- Tensor relu() const;
- Tensor & relu_() const;
- Tensor prelu(const Tensor & weight) const;
- std::tuple<Tensor,Tensor> prelu_backward(const Tensor & grad_output, const Tensor & weight) const;
- Tensor hardshrink(Scalar lambd=0.5) const;
- Tensor hardshrink_backward(const Tensor & grad_out, Scalar lambd) const;
- Tensor rsqrt() const;
- Tensor & rsqrt_() const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor select(Dimname dim, int64_t index) const;
- #endif
- Tensor select(int64_t dim, int64_t index) const;
- Tensor sigmoid() const;
- Tensor & sigmoid_() const;
- Tensor sin() const;
- Tensor & sin_() const;
- Tensor sinh() const;
- Tensor & sinh_() const;
- Tensor detach() const;
- Tensor & detach_() const;
- int64_t size(int64_t dim) const;
- #ifdef BUILD_NAMEDTENSOR
- int64_t size(Dimname dim) const;
- #endif
- Tensor slice(int64_t dim=0, int64_t start=0, int64_t end=9223372036854775807, int64_t step=1) const;
- std::tuple<Tensor,Tensor> slogdet() const;
- Tensor smm(const Tensor & mat2) const;
- Tensor softmax(int64_t dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor softmax(Dimname dim, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- std::vector<Tensor> split(int64_t split_size, int64_t dim=0) const;
- std::vector<Tensor> split_with_sizes(IntArrayRef split_sizes, int64_t dim=0) const;
- Tensor squeeze() const;
- Tensor squeeze(int64_t dim) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor squeeze(Dimname dim) const;
- #endif
- Tensor & squeeze_() const;
- Tensor & squeeze_(int64_t dim) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor & squeeze_(Dimname dim) const;
- #endif
- Tensor sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const;
- Tensor stft(int64_t n_fft, c10::optional<int64_t> hop_length=c10::nullopt, c10::optional<int64_t> win_length=c10::nullopt, const Tensor & window={}, bool normalized=false, bool onesided=true) const;
- int64_t stride(int64_t dim) const;
- #ifdef BUILD_NAMEDTENSOR
- int64_t stride(Dimname dim) const;
- #endif
- Tensor sum(c10::optional<ScalarType> dtype=c10::nullopt) const;
- Tensor sum(IntArrayRef dim, bool keepdim=false, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor sum(DimnameList dim, bool keepdim=false, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- Tensor sum_to_size(IntArrayRef size) const;
- Tensor sqrt() const;
- Tensor & sqrt_() const;
- Tensor std(bool unbiased=true) const;
- Tensor std(IntArrayRef dim, bool unbiased=true, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor std(DimnameList dim, bool unbiased=true, bool keepdim=false) const;
- #endif
- Tensor prod(c10::optional<ScalarType> dtype=c10::nullopt) const;
- Tensor prod(int64_t dim, bool keepdim=false, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor prod(Dimname dim, bool keepdim=false, c10::optional<ScalarType> dtype=c10::nullopt) const;
- #endif
- Tensor t() const;
- Tensor & t_() const;
- Tensor tan() const;
- Tensor & tan_() const;
- Tensor tanh() const;
- Tensor & tanh_() const;
- Tensor transpose(int64_t dim0, int64_t dim1) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor transpose(Dimname dim0, Dimname dim1) const;
- #endif
- Tensor & transpose_(int64_t dim0, int64_t dim1) const;
- Tensor flip(IntArrayRef dims) const;
- Tensor roll(IntArrayRef shifts, IntArrayRef dims={}) const;
- Tensor rot90(int64_t k=1, IntArrayRef dims={0,1}) const;
- Tensor trunc() const;
- Tensor & trunc_() const;
- Tensor type_as(const Tensor & other) const;
- Tensor unsqueeze(int64_t dim) const;
- Tensor & unsqueeze_(int64_t dim) const;
- Tensor var(bool unbiased=true) const;
- Tensor var(IntArrayRef dim, bool unbiased=true, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor var(DimnameList dim, bool unbiased=true, bool keepdim=false) const;
- #endif
- Tensor view_as(const Tensor & other) const;
- Tensor where(const Tensor & condition, const Tensor & other) const;
- Tensor norm(c10::optional<Scalar> p, ScalarType dtype) const;
- Tensor norm(Scalar p=2) const;
- Tensor norm(c10::optional<Scalar> p, IntArrayRef dim, bool keepdim, ScalarType dtype) const;
- Tensor norm(c10::optional<Scalar> p, IntArrayRef dim, bool keepdim=false) const;
- #ifdef BUILD_NAMEDTENSOR
- Tensor norm(c10::optional<Scalar> p, DimnameList dim, bool keepdim, ScalarType dtype) const;
- #endif
- #ifdef BUILD_NAMEDTENSOR
- Tensor norm(c10::optional<Scalar> p, DimnameList dim, bool keepdim=false) const;
- #endif
- Tensor clone(c10::optional<MemoryFormat> memory_format=c10::nullopt) const;
- Tensor & resize_as_(const Tensor & the_template) const;
- Tensor pow(Scalar exponent) const;
- Tensor & zero_() const;
- Tensor sub(const Tensor & other, Scalar alpha=1) const;
- Tensor & sub_(const Tensor & other, Scalar alpha=1) const;
- Tensor sub(Scalar other, Scalar alpha=1) const;
- Tensor & sub_(Scalar other, Scalar alpha=1) const;
- Tensor addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const;
- Tensor & addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta=1, Scalar alpha=1) const;
- Tensor & sparse_resize_(IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const;
- Tensor & sparse_resize_and_clear_(IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const;
- Tensor sparse_mask(const Tensor & mask) const;
- Tensor to_dense() const;
- int64_t sparse_dim() const;
- int64_t _dimI() const;
- int64_t dense_dim() const;
- int64_t _dimV() const;
- int64_t _nnz() const;
- Tensor coalesce() const;
- bool is_coalesced() const;
- Tensor _indices() const;
- Tensor _values() const;
- Tensor & _coalesced_(bool coalesced) const;
- Tensor indices() const;
- Tensor values() const;
- int64_t numel() const;
- std::vector<Tensor> unbind(int64_t dim=0) const;
- #ifdef BUILD_NAMEDTENSOR
- std::vector<Tensor> unbind(Dimname dim) const;
- #endif
- Tensor to_sparse(int64_t sparse_dim) const;
- Tensor to_sparse() const;
- Tensor to_mkldnn() const;
- Tensor dequantize() const;
- double q_scale() const;
- int64_t q_zero_point() const;
- Tensor q_per_channel_scales() const;
- Tensor q_per_channel_zero_points() const;
- int64_t q_per_channel_axis() const;
- Tensor int_repr() const;
- QScheme qscheme() const;
- Tensor to(const TensorOptions & options, bool non_blocking=false, bool copy=false) const;
- Tensor to(Device device, ScalarType dtype, bool non_blocking=false, bool copy=false) const;
- Tensor to(ScalarType dtype, bool non_blocking=false, bool copy=false) const;
- Tensor to(const Tensor & other, bool non_blocking=false, bool copy=false) const;
- Scalar item() const;
- Tensor & set_(Storage source) const;
- Tensor & set_(Storage source, int64_t storage_offset, IntArrayRef size, IntArrayRef stride={}) const;
- Tensor & set_(const Tensor & source) const;
- Tensor & set_() const;
- Tensor & set_quantizer_(ConstQuantizerPtr quantizer) const;
- bool is_set_to(const Tensor & tensor) const;
- Tensor & masked_fill_(const Tensor & mask, Scalar value) const;
- Tensor masked_fill(const Tensor & mask, Scalar value) const;
- Tensor & masked_fill_(const Tensor & mask, const Tensor & value) const;
- Tensor masked_fill(const Tensor & mask, const Tensor & value) const;
- Tensor & masked_scatter_(const Tensor & mask, const Tensor & source) const;
- Tensor masked_scatter(const Tensor & mask, const Tensor & source) const;
- Tensor view(IntArrayRef size) const;
-  Tensor & put_(const Tensor & index, const Tensor & source, bool accumulate=false) const;
-  Tensor & index_add_(int64_t dim, const Tensor & index, const Tensor & source) const;
-  Tensor index_add(int64_t dim, const Tensor & index, const Tensor & source) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor index_add(Dimname dim, const Tensor & index, const Tensor & source) const;
-  #endif
-  Tensor & index_fill_(int64_t dim, const Tensor & index, Scalar value) const;
-  Tensor index_fill(int64_t dim, const Tensor & index, Scalar value) const;
-  Tensor & index_fill_(int64_t dim, const Tensor & index, const Tensor & value) const;
-  Tensor index_fill(int64_t dim, const Tensor & index, const Tensor & value) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor & index_fill_(Dimname dim, const Tensor & index, Scalar value) const;
-  #endif
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor & index_fill_(Dimname dim, const Tensor & index, const Tensor & value) const;
-  #endif
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor index_fill(Dimname dim, const Tensor & index, Scalar value) const;
-  #endif
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor index_fill(Dimname dim, const Tensor & index, const Tensor & value) const;
-  #endif
-  Tensor & scatter_(int64_t dim, const Tensor & index, const Tensor & src) const;
-  Tensor scatter(int64_t dim, const Tensor & index, const Tensor & src) const;
-  Tensor & scatter_(int64_t dim, const Tensor & index, Scalar value) const;
-  Tensor scatter(int64_t dim, const Tensor & index, Scalar value) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor scatter(Dimname dim, const Tensor & index, const Tensor & src) const;
-  #endif
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor scatter(Dimname dim, const Tensor & index, Scalar value) const;
-  #endif
-  Tensor & scatter_add_(int64_t dim, const Tensor & index, const Tensor & src) const;
-  Tensor scatter_add(int64_t dim, const Tensor & index, const Tensor & src) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor scatter_add(Dimname dim, const Tensor & index, const Tensor & src) const;
-  #endif
-  Tensor & lt_(Scalar other) const;
-  Tensor & lt_(const Tensor & other) const;
-  Tensor & gt_(Scalar other) const;
-  Tensor & gt_(const Tensor & other) const;
-  Tensor & le_(Scalar other) const;
-  Tensor & le_(const Tensor & other) const;
-  Tensor & ge_(Scalar other) const;
-  Tensor & ge_(const Tensor & other) const;
-  Tensor & eq_(Scalar other) const;
-  Tensor & eq_(const Tensor & other) const;
-  Tensor & ne_(Scalar other) const;
-  Tensor & ne_(const Tensor & other) const;
-  Tensor __and__(Scalar other) const;
-  Tensor __and__(const Tensor & other) const;
-  Tensor & __iand__(Scalar other) const;
-  Tensor & __iand__(const Tensor & other) const;
-  Tensor __or__(Scalar other) const;
-  Tensor __or__(const Tensor & other) const;
-  Tensor & __ior__(Scalar other) const;
-  Tensor & __ior__(const Tensor & other) const;
-  Tensor __xor__(Scalar other) const;
-  Tensor __xor__(const Tensor & other) const;
-  Tensor & __ixor__(Scalar other) const;
-  Tensor & __ixor__(const Tensor & other) const;
-  Tensor __lshift__(Scalar other) const;
-  Tensor __lshift__(const Tensor & other) const;
-  Tensor & __ilshift__(Scalar other) const;
-  Tensor & __ilshift__(const Tensor & other) const;
-  Tensor __rshift__(Scalar other) const;
-  Tensor __rshift__(const Tensor & other) const;
-  Tensor & __irshift__(Scalar other) const;
-  Tensor & __irshift__(const Tensor & other) const;
-  Tensor & lgamma_() const;
-  Tensor & atan2_(const Tensor & other) const;
-  Tensor & tril_(int64_t diagonal=0) const;
-  Tensor & triu_(int64_t diagonal=0) const;
-  Tensor & digamma_() const;
-  Tensor & polygamma_(int64_t n) const;
-  Tensor & renorm_(Scalar p, int64_t dim, Scalar maxnorm) const;
-  Tensor & pow_(Scalar exponent) const;
-  Tensor & pow_(const Tensor & exponent) const;
-  Tensor & lerp_(const Tensor & end, Scalar weight) const;
-  Tensor & lerp_(const Tensor & end, const Tensor & weight) const;
-  Tensor & fmod_(Scalar other) const;
-  Tensor & fmod_(const Tensor & other) const;
-  Tensor & remainder_(Scalar other) const;
-  Tensor & remainder_(const Tensor & other) const;
-  Tensor & addbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
-  Tensor addbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta=1, Scalar alpha=1) const;
-  Tensor & addcdiv_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const;
-  Tensor & random_(int64_t from, int64_t to, Generator * generator=nullptr) const;
-  Tensor & random_(int64_t to, Generator * generator=nullptr) const;
-  Tensor & random_(Generator * generator=nullptr) const;
-  Tensor & uniform_(double from=0, double to=1, Generator * generator=nullptr) const;
-  Tensor & normal_(double mean=0, double std=1, Generator * generator=nullptr) const;
-  Tensor & cauchy_(double median=0, double sigma=1, Generator * generator=nullptr) const;
-  Tensor & log_normal_(double mean=1, double std=2, Generator * generator=nullptr) const;
-  Tensor & exponential_(double lambd=1, Generator * generator=nullptr) const;
-  Tensor & geometric_(double p, Generator * generator=nullptr) const;
-  Tensor diag(int64_t diagonal=0) const;
-  Tensor cross(const Tensor & other, c10::optional<int64_t> dim=c10::nullopt) const;
-  Tensor triu(int64_t diagonal=0) const;
-  Tensor tril(int64_t diagonal=0) const;
-  Tensor trace() const;
-  Tensor ne(Scalar other) const;
-  Tensor ne(const Tensor & other) const;
-  Tensor eq(Scalar other) const;
-  Tensor eq(const Tensor & other) const;
-  Tensor ge(Scalar other) const;
-  Tensor ge(const Tensor & other) const;
-  Tensor le(Scalar other) const;
-  Tensor le(const Tensor & other) const;
-  Tensor gt(Scalar other) const;
-  Tensor gt(const Tensor & other) const;
-  Tensor lt(Scalar other) const;
-  Tensor lt(const Tensor & other) const;
-  Tensor take(const Tensor & index) const;
-  Tensor index_select(int64_t dim, const Tensor & index) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor index_select(Dimname dim, const Tensor & index) const;
-  #endif
-  Tensor masked_select(const Tensor & mask) const;
-  Tensor nonzero() const;
-  std::vector<Tensor> nonzero_numpy() const;
-  Tensor gather(int64_t dim, const Tensor & index, bool sparse_grad=false) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor gather(Dimname dim, const Tensor & index, bool sparse_grad=false) const;
-  #endif
-  Tensor addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const;
-  Tensor & addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const;
-  Tensor addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value=1) const;
-  std::tuple<Tensor,Tensor> lstsq(const Tensor & A) const;
-  std::tuple<Tensor,Tensor> triangular_solve(const Tensor & A, bool upper=true, bool transpose=false, bool unitriangular=false) const;
-  std::tuple<Tensor,Tensor> symeig(bool eigenvectors=false, bool upper=true) const;
-  std::tuple<Tensor,Tensor> eig(bool eigenvectors=false) const;
-  std::tuple<Tensor,Tensor,Tensor> svd(bool some=true, bool compute_uv=true) const;
-  Tensor cholesky(bool upper=false) const;
-  Tensor cholesky_solve(const Tensor & input2, bool upper=false) const;
-  std::tuple<Tensor,Tensor> solve(const Tensor & A) const;
-  Tensor cholesky_inverse(bool upper=false) const;
-  std::tuple<Tensor,Tensor> qr(bool some=true) const;
-  std::tuple<Tensor,Tensor> geqrf() const;
-  Tensor orgqr(const Tensor & input2) const;
-  Tensor ormqr(const Tensor & input2, const Tensor & input3, bool left=true, bool transpose=false) const;
-  Tensor lu_solve(const Tensor & LU_data, const Tensor & LU_pivots) const;
-  Tensor multinomial(int64_t num_samples, bool replacement=false, Generator * generator=nullptr) const;
-  Tensor lgamma() const;
-  Tensor digamma() const;
-  Tensor polygamma(int64_t n) const;
-  Tensor erfinv() const;
-  Tensor & erfinv_() const;
-  Tensor sign() const;
-  Tensor & sign_() const;
-  Tensor dist(const Tensor & other, Scalar p=2) const;
-  Tensor atan2(const Tensor & other) const;
-  Tensor lerp(const Tensor & end, Scalar weight) const;
-  Tensor lerp(const Tensor & end, const Tensor & weight) const;
-  Tensor histc(int64_t bins=100, Scalar min=0, Scalar max=0) const;
-  Tensor fmod(Scalar other) const;
-  Tensor fmod(const Tensor & other) const;
-  Tensor remainder(Scalar other) const;
-  Tensor remainder(const Tensor & other) const;
-  Tensor min(const Tensor & other) const;
-  Tensor min() const;
-  Tensor max(const Tensor & other) const;
-  Tensor max() const;
-  Tensor median() const;
-  std::tuple<Tensor,Tensor> sort(int64_t dim=-1, bool descending=false) const;
-  #ifdef BUILD_NAMEDTENSOR
-  std::tuple<Tensor,Tensor> sort(Dimname dim, bool descending=false) const;
-  #endif
-  Tensor argsort(int64_t dim=-1, bool descending=false) const;
-  #ifdef BUILD_NAMEDTENSOR
-  Tensor argsort(Dimname dim, bool descending=false) const;
-  #endif
-  std::tuple<Tensor,Tensor> topk(int64_t k, int64_t dim=-1, bool largest=true, bool sorted=true) const;
-  Tensor all() const;
-  Tensor any() const;
-  Tensor renorm(Scalar p, int64_t dim, Scalar maxnorm) const;
-  Tensor unfold(int64_t dimension, int64_t size, int64_t step) const;
-  bool equal(const Tensor & other) const;
-  Tensor pow(const Tensor & exponent) const;
-  Tensor alias() const;
-
-  // We changed .dtype() to return a TypeMeta in #12766. Ideally, we want the
-  // at::kDouble and its friends to be TypeMeta's, but that hasn't happened yet.
-  // Before that change, we make this method to maintain BC for C++ usage like
-  // `x.to(y.dtype)`.
-  // TODO: remove following two after at::kDouble and its friends are TypeMeta's.
-  inline Tensor to(caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const {
-    return this->to(/*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy);
-  }
-  inline Tensor to(Device device, caffe2::TypeMeta type_meta, bool non_blocking=false, bool copy=false) const {
-    return this->to(device, /*scalar_type=*/typeMetaToScalarType(type_meta), non_blocking, copy);
-  }
-
-  template <typename F, typename... Args>
-  auto m(F func, Args&&... params) const -> decltype(func(*this, std::forward<Args>(params)...)) {
-    return func(*this, std::forward<Args>(params)...);
-  }
-
-protected:
-  friend class ::caffe2::Tensor;
-
-  void enforce_invariants();
-  c10::intrusive_ptr<TensorImpl, UndefinedTensorImpl> impl_;
-};
-
-namespace detail {
-// Helper creator for Tensor class which doesn't requires the users to pass
-// in an intrusive_ptr instead it just converts the argument passed to
-// requested intrusive_ptr type.
-template <typename T, typename... Args>
-Tensor make_tensor(Args&&... args) {
-  return Tensor(c10::make_intrusive<T>(std::forward<Args>(args)...));
-}
-
-} // namespace detail
-
-static inline TensorTypeId legacyExtractTypeId(const Tensor& t) {
-  return legacyExtractTypeId(t.type_set());
-}
-
-} // namespace at
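The two to(caffe2::TypeMeta, ...) overloads removed above exist only for backward compatibility, as their comment says: x.to(y.dtype()) keeps compiling even though dtype() now returns a caffe2::TypeMeta rather than a ScalarType. A minimal usage sketch of that path, assuming an ATen build of this vintage (the example shapes and values are illustrative):

#include <ATen/ATen.h>

int main() {
  at::Tensor x = at::ones({2, 2});                // default float tensor
  at::Tensor y = at::zeros({2, 2}, at::kDouble);  // double tensor
  // y.dtype() yields a caffe2::TypeMeta, not a ScalarType; the BC overload
  // converts it via typeMetaToScalarType, so this compiles unchanged:
  at::Tensor z = x.to(y.dtype());
  return z.scalar_type() == at::kDouble ? 0 : 1;
}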
diff --git a/aten/src/ATen/core/TensorMethods.h b/aten/src/ATen/core/TensorMethods.h
deleted file mode 100644
index fdcf7d39944cc..0000000000000
--- a/aten/src/ATen/core/TensorMethods.h
+++ /dev/null
@@ -1,6236 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#ifdef USE_STATIC_DISPATCH
-#include
-#include
-#include
-#include
-#endif
-
-namespace at {
-
-struct Quantizer;
-// This is temporary typedef to enable Quantizer in aten native function API
-// we'll remove them when we are actually exposing Quantizer class
-// to frontend
-using ConstQuantizerPtr = const c10::intrusive_ptr<Quantizer>&;
-
-inline Tensor Tensor::cpu() const {
-  return to(options().device(DeviceType::CPU), /*non_blocking*/ false, /*copy*/ false);
-}
-
-// TODO: The Python version also accepts arguments
-inline Tensor Tensor::cuda() const {
-  return to(options().device(DeviceType::CUDA), /*non_blocking*/ false, /*copy*/ false);
-}
-
-inline Tensor Tensor::hip() const {
-  return to(options().device(DeviceType::HIP), /*non_blocking*/ false, /*copy*/ false);
-}
-
-inline Tensor Tensor::toType(ScalarType t) const {
-  return to(options().dtype(t), /*non_blocking*/ false, /*copy*/ false);
-}
-
-// TODO: Deprecate me
-inline Tensor Tensor::toBackend(Backend b) const {
-  return to(options().device(backendToDeviceType(b)).layout(layout_from_backend(b)), /*non_blocking*/ false, /*copy*/ false);
-}
-
-inline TensorOptions Tensor::options() const {
-  return TensorOptions().dtype(dtype())
-                        .device(device())
-                        .layout(layout())
-                        .is_variable(is_variable());
-}
-
-// all static inline to allow for inlining of the non-dynamic part of dispatch
-inline void Tensor::backward(const Tensor & gradient, bool keep_graph, bool create_graph) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    TypeDefault::backward(const_cast<Tensor&>(*this), gradient, keep_graph, create_graph);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::backward(Tensor self, Tensor? gradient=None, bool keep_graph=False, bool create_graph=False) -> void");
-    return table->callUnboxed<void, const Tensor &, const Tensor &, bool, bool>(const_cast<Tensor&>(*this), gradient, keep_graph, create_graph);
-#endif
-}
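Every generated method body in this deleted file repeats the shape seen in backward() above: with USE_STATIC_DISPATCH defined the call goes straight to a backend type (TypeDefault, CPUType, SparseCPUType, ...), otherwise the operator is resolved once against a global dispatch table, cached in a function-local static, and then invoked unboxed. A self-contained mock of that shape follows; every name in it is an illustrative stand-in, not the real c10 API:

#include <functional>
#include <string>
#include <unordered_map>

using Op = std::function<int(int)>;

// Stand-in for the global dispatch table the generated code consults.
std::unordered_map<std::string, Op>& op_table() {
  static std::unordered_map<std::string, Op> t{
      {"aten::twice", [](int x) { return 2 * x; }}};
  return t;
}

int twice(int x) {
#ifdef USE_STATIC_DISPATCH
  return 2 * x;  // direct backend call, resolved at compile time
#else
  // Looked up once per process and cached in a function-local static,
  // then called unboxed: the same two-branch shape as the methods below.
  static Op op = op_table().at("aten::twice");
  return op(x);
#endif
}

int main() { return twice(21) == 42 ? 0 : 1; }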
-inline void Tensor::set_data(const Tensor & new_data) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    TypeDefault::set_data(const_cast<Tensor&>(*this), new_data);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::set_data(Tensor(a!) self, Tensor new_data) -> void");
-    return table->callUnboxed<void, const Tensor &, const Tensor &>(const_cast<Tensor&>(*this), new_data);
-#endif
-}
-inline Tensor Tensor::data() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::data(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::data", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline bool Tensor::is_leaf() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_leaf(const_cast<Tensor&>(*this));
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::is_leaf(Tensor self) -> bool");
-    return table->callUnboxed<bool, const Tensor &>(const_cast<Tensor&>(*this));
-#endif
-}
-inline int64_t Tensor::output_nr() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::output_nr(const_cast<Tensor&>(*this));
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::output_nr(Tensor self) -> int");
-    return table->callUnboxed<int64_t, const Tensor &>(const_cast<Tensor&>(*this));
-#endif
-}
-inline int64_t Tensor::_version() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::_version(const_cast<Tensor&>(*this));
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::_version(Tensor self) -> int");
-    return table->callUnboxed<int64_t, const Tensor &>(const_cast<Tensor&>(*this));
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor & Tensor::rename_(c10::optional<DimnameList> names) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::rename_(const_cast<Tensor&>(*this), names);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::rename_(Tensor(a!) self, Dimname[]? names) -> Tensor(a!)");
-    return table->callUnboxed<Tensor &, const Tensor &, c10::optional<DimnameList>>(const_cast<Tensor&>(*this), names);
-#endif
-}
-#endif
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::rename(c10::optional<DimnameList> names) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::rename(const_cast<Tensor&>(*this), names);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::rename(Tensor(a) self, Dimname[]? 
names) -> Tensor(a)"); - return table->callUnboxed>(const_cast(*this), names); -#endif -} -#endif -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::align_to(DimnameList names) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::align_to(const_cast(*this), names); -#else - static auto table = globalATenDispatch().getOpTable("aten::align_to(Tensor(a) self, DimnameList names) -> Tensor(a)"); - return table->callUnboxed(const_cast(*this), names); -#endif -} -#endif -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::align_as(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::align_as(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::align_as", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -#endif -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::refine_names(DimnameList names) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::refine_names(const_cast(*this), names); -#else - static auto table = globalATenDispatch().getOpTable("aten::refine_names(Tensor(a) self, DimnameList names) -> Tensor(a)"); - return table->callUnboxed(const_cast(*this), names); -#endif -} -#endif -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::unflatten(Dimname dim, IntArrayRef sizes, DimnameList names) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::unflatten(const_cast(*this), dim, sizes, names); -#else - static auto table = globalATenDispatch().getOpTable("aten::unflatten(Tensor self, Dimname dim, int[] sizes, DimnameList names) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, sizes, names); -#endif -} -#endif -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::unflatten(int64_t dim, IntArrayRef sizes, DimnameList names) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::unflatten(const_cast(*this), dim, sizes, names); -#else - static auto table = globalATenDispatch().getOpTable("aten::unflatten(Tensor self, int dim, int[] sizes, DimnameList names) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, sizes, names); -#endif -} -#endif -inline Tensor Tensor::abs() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::abs(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::abs", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::abs_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::abs_(const_cast(*this)); - break; - default: - AT_ERROR("abs_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::abs_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::acos() const { 
-#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::acos(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::acos", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::acos_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::acos_(const_cast(*this)); - break; - default: - AT_ERROR("acos_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::acos_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::add(const Tensor & other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::add(const_cast(*this), other, alpha); - break; - case Backend::SparseCPU: - return SparseCPUType::add(const_cast(*this), other, alpha); - break; - default: - AT_ERROR("add not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::add", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other, alpha); -#endif -} -inline Tensor & Tensor::add_(const Tensor & other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::add_(const_cast(*this), other, alpha); - break; - case Backend::SparseCPU: - return SparseCPUType::add_(const_cast(*this), other, alpha); - break; - default: - AT_ERROR("add_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::add_", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other, alpha); -#endif -} -inline Tensor Tensor::add(Scalar other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::add(const_cast(*this), other, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::add", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other, alpha); -#endif -} -inline Tensor & Tensor::add_(Scalar other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::add_(const_cast(*this), other, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::add_", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other, alpha); 
-#endif -} -inline Tensor Tensor::addmv(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::addmv(const_cast(*this), mat, vec, beta, alpha); - break; - default: - AT_ERROR("addmv not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addmv", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat, vec)), const_cast(*this), mat, vec, beta, alpha); -#endif -} -inline Tensor & Tensor::addmv_(const Tensor & mat, const Tensor & vec, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::addmv_(const_cast(*this), mat, vec, beta, alpha); - break; - default: - AT_ERROR("addmv_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addmv_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat, vec)), const_cast(*this), mat, vec, beta, alpha); -#endif -} -inline Tensor Tensor::addr(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::addr(const_cast(*this), vec1, vec2, beta, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addr", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, vec1, vec2)), const_cast(*this), vec1, vec2, beta, alpha); -#endif -} -inline Tensor & Tensor::addr_(const Tensor & vec1, const Tensor & vec2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::addr_(const_cast(*this), vec1, vec2, beta, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addr_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, vec1, vec2)), const_cast(*this), vec1, vec2, beta, alpha); -#endif -} -inline Tensor Tensor::all(int64_t dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::all(const_cast(*this), dim, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::all", "dim"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, keepdim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::all(Dimname dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::all(const_cast(*this), dim, keepdim); -#else - static auto table = globalATenDispatch().getOpTable("aten::all.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, keepdim); -#endif -} -#endif -inline 
bool Tensor::allclose(const Tensor & other, double rtol, double atol, bool equal_nan) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::allclose(const_cast(*this), other, rtol, atol, equal_nan); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::allclose", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other, rtol, atol, equal_nan); -#endif -} -inline Tensor Tensor::any(int64_t dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::any(const_cast(*this), dim, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::any", "dim"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, keepdim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::any(Dimname dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::any(const_cast(*this), dim, keepdim); -#else - static auto table = globalATenDispatch().getOpTable("aten::any.dimname(Tensor self, Dimname dim, bool keepdim=False) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, keepdim); -#endif -} -#endif -inline Tensor Tensor::argmax(c10::optional dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::argmax(const_cast(*this), dim, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::argmax", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, keepdim); -#endif -} -inline Tensor Tensor::argmin(c10::optional dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::argmin(const_cast(*this), dim, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::argmin", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, keepdim); -#endif -} -inline Tensor Tensor::as_strided(IntArrayRef size, IntArrayRef stride, c10::optional storage_offset) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::as_strided(const_cast(*this), size, stride, storage_offset); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::as_strided(const_cast(*this), size, stride, storage_offset); - break; - default: - AT_ERROR("as_strided not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::as_strided", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), size, stride, storage_offset); -#endif -} -inline Tensor & Tensor::as_strided_(IntArrayRef size, IntArrayRef stride, c10::optional storage_offset) const { -#ifdef USE_STATIC_DISPATCH - 
at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::as_strided_(const_cast(*this), size, stride, storage_offset); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::as_strided_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), size, stride, storage_offset); -#endif -} -inline Tensor Tensor::asin() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::asin(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::asin", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::asin_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::asin_(const_cast(*this)); - break; - default: - AT_ERROR("asin_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::asin_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::atan() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::atan(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::atan", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::atan_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::atan_(const_cast(*this)); - break; - default: - AT_ERROR("atan_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::atan_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::baddbmm(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::baddbmm(const_cast(*this), batch1, batch2, beta, alpha); - break; - default: - AT_ERROR("baddbmm not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::baddbmm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, batch1, batch2)), const_cast(*this), batch1, batch2, beta, alpha); -#endif -} -inline Tensor & Tensor::baddbmm_(const Tensor & batch1, const Tensor & batch2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - 
switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::baddbmm_(const_cast(*this), batch1, batch2, beta, alpha); - break; - default: - AT_ERROR("baddbmm_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::baddbmm_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, batch1, batch2)), const_cast(*this), batch1, batch2, beta, alpha); -#endif -} -inline Tensor Tensor::bernoulli(Generator * generator) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::bernoulli(const_cast(*this), generator); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bernoulli", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), generator); -#endif -} -inline Tensor & Tensor::bernoulli_(const Tensor & p, Generator * generator) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::bernoulli_(const_cast(*this), p, generator); - break; - default: - AT_ERROR("bernoulli_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bernoulli_", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, p)), const_cast(*this), p, generator); -#endif -} -inline Tensor & Tensor::bernoulli_(double p, Generator * generator) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::bernoulli_(const_cast(*this), p, generator); - break; - default: - AT_ERROR("bernoulli_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bernoulli_", "float"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), p, generator); -#endif -} -inline Tensor Tensor::bernoulli(double p, Generator * generator) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::bernoulli(const_cast(*this), p, generator); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bernoulli", "p"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), p, generator); -#endif -} -inline Tensor Tensor::bincount(const Tensor & weights, int64_t minlength) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::bincount(const_cast(*this), weights, minlength); - break; - default: - AT_ERROR("bincount not implemented for ", at::toString(type_set())); - } -#else - static auto table = globalATenDispatch().getOpTable("aten::bincount(Tensor self, Tensor? 
weights=None, int minlength=0) -> Tensor"); - return table->callUnboxed(const_cast(*this), weights, minlength); -#endif -} -inline Tensor Tensor::bitwise_not() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::bitwise_not(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bitwise_not", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::bitwise_not_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::bitwise_not_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bitwise_not_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::logical_not() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::logical_not(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::logical_not", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::logical_not_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::logical_not_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::logical_not_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::logical_xor(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::logical_xor(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::logical_xor", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor & Tensor::logical_xor_(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::logical_xor_(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::logical_xor_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::bmm(const Tensor & mat2) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::bmm(const_cast(*this), mat2); - break; - default: - AT_ERROR("bmm not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::bmm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, 
mat2)), const_cast(*this), mat2); -#endif -} -inline Tensor Tensor::ceil() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::ceil(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::ceil", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::ceil_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::ceil_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::ceil_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline std::vector Tensor::chunk(int64_t chunks, int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::chunk(const_cast(*this), chunks, dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::chunk", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, int64_t, int64_t>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), chunks, dim); -#endif -} -inline Tensor Tensor::clamp(c10::optional min, c10::optional max) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::clamp(const_cast(*this), min, max); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::clamp", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed, c10::optional>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), min, max); -#endif -} -inline Tensor & Tensor::clamp_(c10::optional min, c10::optional max) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::clamp_(const_cast(*this), min, max); - break; - default: - AT_ERROR("clamp_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::clamp_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, c10::optional>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), min, max); -#endif -} -inline Tensor Tensor::clamp_max(Scalar max) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::clamp_max(const_cast(*this), max); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::clamp_max", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), max); -#endif -} -inline Tensor & Tensor::clamp_max_(Scalar max) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::clamp_max_(const_cast(*this), max); - break; - default: - AT_ERROR("clamp_max_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = 
c10::Dispatcher::singleton().findSchema({"aten::clamp_max_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), max); -#endif -} -inline Tensor Tensor::clamp_min(Scalar min) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::clamp_min(const_cast(*this), min); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::clamp_min", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), min); -#endif -} -inline Tensor & Tensor::clamp_min_(Scalar min) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::clamp_min_(const_cast(*this), min); - break; - default: - AT_ERROR("clamp_min_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::clamp_min_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), min); -#endif -} -inline Tensor Tensor::contiguous(MemoryFormat memory_format) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::contiguous(const_cast(*this), memory_format); -#else - static auto table = globalATenDispatch().getOpTable("aten::contiguous(Tensor self, *, MemoryFormat memory_format=contiguous_format) -> Tensor"); - return table->callUnboxed(const_cast(*this), memory_format); -#endif -} -inline Tensor & Tensor::copy_(const Tensor & src, bool non_blocking) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::copy_(const_cast(*this), src, non_blocking); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::copy_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, src)), const_cast(*this), src, non_blocking); -#endif -} -inline Tensor Tensor::cos() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cos(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cos", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::cos_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::cos_(const_cast(*this)); - break; - default: - AT_ERROR("cos_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cos_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::cosh() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cosh(const_cast(*this)); -#else - static 
c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cosh", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::cosh_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::cosh_(const_cast(*this)); - break; - default: - AT_ERROR("cosh_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cosh_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::cumsum(int64_t dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cumsum(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::cumsum(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, dtype); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::cumsum(Dimname dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cumsum(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::cumsum.dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, dtype); -#endif -} -#endif -inline Tensor Tensor::cumprod(int64_t dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cumprod(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::cumprod(Tensor self, int dim, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, dtype); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::cumprod(Dimname dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cumprod(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::cumprod.dimname(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, dtype); -#endif -} -#endif -inline Tensor Tensor::det() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::det(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::det", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::diag_embed(int64_t offset, int64_t dim1, int64_t dim2) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::diag_embed(const_cast(*this), offset, dim1, dim2); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::diag_embed", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), offset, dim1, dim2); -#endif -} -inline Tensor Tensor::diagflat(int64_t offset) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::diagflat(const_cast(*this), offset); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::diagflat", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), offset); -#endif -} -inline Tensor Tensor::diagonal(int64_t offset, int64_t dim1, int64_t dim2) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::diagonal(const_cast(*this), offset, dim1, dim2); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::diagonal", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), offset, dim1, dim2); -#endif -} -inline Tensor & Tensor::fill_diagonal_(Scalar fill_value, bool wrap) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::fill_diagonal_(const_cast(*this), fill_value, wrap); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::fill_diagonal_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), fill_value, wrap); -#endif -} -inline Tensor Tensor::div(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::div(const_cast(*this), other); - break; - case Backend::SparseCPU: - return SparseCPUType::div(const_cast(*this), other); - break; - default: - AT_ERROR("div not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::div", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor & Tensor::div_(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case 
Backend::CPU: - return CPUType::div_(const_cast(*this), other); - break; - case Backend::SparseCPU: - return SparseCPUType::div_(const_cast(*this), other); - break; - default: - AT_ERROR("div_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::div_", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::div(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::div(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::div", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor & Tensor::div_(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::div_(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::div_", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::dot(const Tensor & tensor) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::dot(const_cast(*this), tensor); - break; - default: - AT_ERROR("dot not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::dot", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, tensor)), const_cast(*this), tensor); -#endif -} -inline Tensor Tensor::new_empty(IntArrayRef size, const TensorOptions & options) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::new_empty(const_cast(*this), size, options); -#else - static auto table = globalATenDispatch().getOpTable("aten::new_empty(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"); - return table->callUnboxed(const_cast(*this), size, options); -#endif -} -inline Tensor Tensor::new_full(IntArrayRef size, Scalar fill_value, const TensorOptions & options) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::new_full(const_cast(*this), size, fill_value, options); -#else - static auto table = globalATenDispatch().getOpTable("aten::new_full(Tensor self, int[] size, Scalar fill_value, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=None) -> Tensor"); - return table->callUnboxed(const_cast(*this), size, fill_value, options); -#endif -} -inline Tensor Tensor::new_zeros(IntArrayRef size, const TensorOptions & options) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::new_zeros(const_cast(*this), size, options); -#else - static auto table = globalATenDispatch().getOpTable("aten::new_zeros(Tensor self, int[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor"); - return table->callUnboxed(const_cast(*this), size, options); -#endif -} -inline Tensor & Tensor::resize_(IntArrayRef size) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::resize_(const_cast(*this), size); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::resize_(const_cast(*this), size); - break; - default: - AT_ERROR("resize_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::resize_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), size); -#endif -} -inline Tensor Tensor::erf() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::erf(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::erf", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::erf_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::erf_(const_cast(*this)); - break; - default: - AT_ERROR("erf_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::erf_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::erfc() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::erfc(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::erfc", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::erfc_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::erfc_(const_cast(*this)); - break; - default: - AT_ERROR("erfc_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::erfc_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::exp() const { -#ifdef USE_STATIC_DISPATCH - 
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::exp(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::exp", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::exp_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::exp_(const_cast<Tensor&>(*this));
-            break;
-        default:
-            AT_ERROR("exp_ not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::exp_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::expm1() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::expm1(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::expm1", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::expm1_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::expm1_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::expm1_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::expand(IntArrayRef size, bool implicit) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::expand(const_cast<Tensor&>(*this), size, implicit);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::expand", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), size, implicit);
-#endif
-}
-inline Tensor Tensor::expand_as(const Tensor & other) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::expand_as(const_cast<Tensor&>(*this), other);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::expand_as", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast<Tensor&>(*this), other);
-#endif
-}
-inline Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::flatten(const_cast<Tensor&>(*this), start_dim, end_dim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::flatten", "using_ints"}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, int64_t>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), start_dim, end_dim);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::flatten(int64_t start_dim, int64_t end_dim, Dimname out_dim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::flatten(const_cast<Tensor&>(*this), start_dim, end_dim, out_dim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::flatten.named_out_dim(Tensor self, int start_dim, int end_dim, Dimname out_dim) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, int64_t, int64_t, Dimname>(const_cast<Tensor&>(*this), start_dim, end_dim, out_dim);
-#endif
-}
-#endif
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::flatten(Dimname start_dim, Dimname end_dim, Dimname out_dim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::flatten(const_cast<Tensor&>(*this), start_dim, end_dim, out_dim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::flatten.using_names(Tensor self, Dimname start_dim, Dimname end_dim, Dimname out_dim) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, Dimname, Dimname, Dimname>(const_cast<Tensor&>(*this), start_dim, end_dim, out_dim);
-#endif
-}
-#endif
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::flatten(DimnameList dims, Dimname out_dim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::flatten(const_cast<Tensor&>(*this), dims, out_dim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::flatten.DimnameList(Tensor self, DimnameList dims, Dimname out_dim) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, DimnameList, Dimname>(const_cast<Tensor&>(*this), dims, out_dim);
-#endif
-}
-#endif
-inline Tensor & Tensor::fill_(Scalar value) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::fill_(const_cast<Tensor&>(*this), value);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::fill_", "Scalar"}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &, Scalar>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), value);
-#endif
-}
-inline Tensor & Tensor::fill_(const Tensor & value) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::fill_(const_cast<Tensor&>(*this), value);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::fill_", "Tensor"}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, value)), const_cast<Tensor&>(*this), value);
-#endif
-}
-inline Tensor Tensor::floor() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::floor(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::floor", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::floor_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::floor_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::floor_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::frac() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::frac(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::frac", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::frac_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::frac_(const_cast<Tensor&>(*this));
-            break;
-        default:
-            AT_ERROR("frac_ not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::frac_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::ger(const Tensor & vec2) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::ger(const_cast<Tensor&>(*this), vec2);
-            break;
-        default:
-            AT_ERROR("ger not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::ger", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, vec2)), const_cast<Tensor&>(*this), vec2);
-#endif
-}
-inline Tensor Tensor::fft(int64_t signal_ndim, bool normalized) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::fft(const_cast<Tensor&>(*this), signal_ndim, normalized);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::fft", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), signal_ndim, normalized);
-#endif
-}
-inline Tensor Tensor::ifft(int64_t signal_ndim, bool normalized) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::ifft(const_cast<Tensor&>(*this), signal_ndim, normalized);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::ifft", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), signal_ndim, normalized);
-#endif
-}
-inline Tensor Tensor::rfft(int64_t signal_ndim, bool normalized, bool onesided) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::rfft(const_cast<Tensor&>(*this), signal_ndim, normalized, onesided);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::rfft", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, bool, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), signal_ndim, normalized, onesided);
-#endif
-}
-inline Tensor Tensor::irfft(int64_t signal_ndim, bool normalized, bool onesided, IntArrayRef signal_sizes) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::irfft(const_cast<Tensor&>(*this), signal_ndim, normalized, onesided, signal_sizes);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::irfft", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, int64_t, bool, bool, IntArrayRef>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), signal_ndim, normalized, onesided, signal_sizes);
-#endif
-}
-inline Tensor Tensor::index(TensorList indices) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index(const_cast<Tensor&>(*this), indices);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::index.Tensor(Tensor self, Tensor?[] indices) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, TensorList>(const_cast<Tensor&>(*this), indices);
-#endif
-}
-inline Tensor & Tensor::index_copy_(int64_t dim, const Tensor & index, const Tensor & source) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index_copy_(const_cast<Tensor&>(*this), dim, index, source);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::index_copy_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &, int64_t, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, index, source)), const_cast<Tensor&>(*this), dim, index, source);
-#endif
-}
-inline Tensor Tensor::index_copy(int64_t dim, const Tensor & index, const Tensor & source) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index_copy(const_cast<Tensor&>(*this), dim, index, source);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::index_copy", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, index, source)), const_cast<Tensor&>(*this), dim, index, source);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor & Tensor::index_copy_(Dimname dim, const Tensor & index, const Tensor & source) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index_copy_(const_cast<Tensor&>(*this), dim, index, source);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::index_copy_.dimname(Tensor(a!) self, Dimname dim, Tensor index, Tensor source) -> Tensor(a!)");
-    return table->callUnboxed<Tensor &, const Tensor &, Dimname, const Tensor &, const Tensor &>(const_cast<Tensor&>(*this), dim, index, source);
-#endif
-}
-#endif
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::index_copy(Dimname dim, const Tensor & index, const Tensor & source) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index_copy(const_cast<Tensor&>(*this), dim, index, source);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::index_copy.dimname(Tensor self, Dimname dim, Tensor index, Tensor source) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, Dimname, const Tensor &, const Tensor &>(const_cast<Tensor&>(*this), dim, index, source);
-#endif
-}
-#endif
-inline Tensor & Tensor::index_put_(TensorList indices, const Tensor & values, bool accumulate) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index_put_(const_cast<Tensor&>(*this), indices, values, accumulate);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::index_put_(Tensor(a!) self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor(a!)");
-    return table->callUnboxed<Tensor &, const Tensor &, TensorList, const Tensor &, bool>(const_cast<Tensor&>(*this), indices, values, accumulate);
-#endif
-}
-inline Tensor Tensor::index_put(TensorList indices, const Tensor & values, bool accumulate) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::index_put(const_cast<Tensor&>(*this), indices, values, accumulate);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::index_put(Tensor self, Tensor?[] indices, Tensor values, bool accumulate=False) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, TensorList, const Tensor &, bool>(const_cast<Tensor&>(*this), indices, values, accumulate);
-#endif
-}
-inline Tensor Tensor::inverse() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::inverse(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::inverse", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::isclose(const Tensor & other, double rtol, double atol, bool equal_nan) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::isclose(const_cast<Tensor&>(*this), other, rtol, atol, equal_nan);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::isclose", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &, double, double, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast<Tensor&>(*this), other, rtol, atol, equal_nan);
-#endif
-}
-inline bool Tensor::is_distributed() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_distributed(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_distributed", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline bool Tensor::is_floating_point() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_floating_point(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_floating_point", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline bool Tensor::is_complex() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_complex(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_complex", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline bool Tensor::is_nonzero() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_nonzero(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_nonzero", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline bool Tensor::is_same_size(const Tensor & other) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_same_size(const_cast<Tensor&>(*this), other);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_same_size", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast<Tensor&>(*this), other);
-#endif
-}
-inline bool Tensor::is_signed() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_signed(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_signed", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline std::tuple<Tensor,Tensor> Tensor::kthvalue(int64_t k, int64_t dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::kthvalue(const_cast<Tensor&>(*this), k, dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::kthvalue", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<std::tuple<Tensor,Tensor>, const Tensor &, int64_t, int64_t, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), k, dim, keepdim);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline std::tuple<Tensor,Tensor> Tensor::kthvalue(int64_t k, Dimname dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::kthvalue(const_cast<Tensor&>(*this), k, dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::kthvalue.dimname(Tensor self, int k, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)");
-    return table->callUnboxed<std::tuple<Tensor,Tensor>, const Tensor &, int64_t, Dimname, bool>(const_cast<Tensor&>(*this), k, dim, keepdim);
-#endif
-}
-#endif
-inline Tensor Tensor::log() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::log(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::log_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::log_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::log10() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::log10(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log10", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::log10_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::log10_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log10_", ""}).value();
c10::Dispatcher::singleton().findSchema({"aten::log10_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::log1p() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::log1p(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log1p", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::log1p_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::log1p_(const_cast(*this)); - break; - case Backend::SparseCPU: - return SparseCPUType::log1p_(const_cast(*this)); - break; - default: - AT_ERROR("log1p_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log1p_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::log2() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::log2(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log2", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::log2_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::log2_(const_cast(*this)); - break; - default: - AT_ERROR("log2_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::log2_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::logdet() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::logdet(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::logdet", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::log_softmax(int64_t dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::log_softmax(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::log_softmax(Tensor self, int dim, ScalarType? 
-    return table->callUnboxed<Tensor, const Tensor &, int64_t, c10::optional<ScalarType>>(const_cast<Tensor&>(*this), dim, dtype);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::log_softmax(Dimname dim, c10::optional<ScalarType> dtype) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::log_softmax(const_cast<Tensor&>(*this), dim, dtype);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::log_softmax(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, Dimname, c10::optional<ScalarType>>(const_cast<Tensor&>(*this), dim, dtype);
-#endif
-}
-#endif
-inline Tensor Tensor::logsumexp(IntArrayRef dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::logsumexp(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::logsumexp", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::logsumexp(DimnameList dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::logsumexp(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::logsumexp.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, DimnameList, bool>(const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#endif
-inline Tensor Tensor::matmul(const Tensor & other) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::matmul(const_cast<Tensor&>(*this), other);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::matmul", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast<Tensor&>(*this), other);
-#endif
-}
-inline Tensor Tensor::matrix_power(int64_t n) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::matrix_power(const_cast<Tensor&>(*this), n);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::matrix_power", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), n);
-#endif
-}
-inline std::tuple<Tensor,Tensor> Tensor::max(int64_t dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::max(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::max", "dim"}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<std::tuple<Tensor,Tensor>, const Tensor &, int64_t, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-inline Tensor Tensor::max_values(IntArrayRef dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::max_values(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::max_values", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline std::tuple<Tensor,Tensor> Tensor::max(Dimname dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::max(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::max.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)");
-    return table->callUnboxed<std::tuple<Tensor,Tensor>, const Tensor &, Dimname, bool>(const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#endif
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::max_values(DimnameList dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::max_values(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::max_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, DimnameList, bool>(const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#endif
-inline Tensor Tensor::mean(c10::optional<ScalarType> dtype) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::mean(const_cast<Tensor&>(*this), dtype);
-            break;
-        case Backend::QuantizedCPU:
-            return QuantizedCPUType::mean(const_cast<Tensor&>(*this), dtype);
-            break;
-        default:
-            AT_ERROR("mean not implemented for ", at::toString(type_set()));
-    }
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::mean(Tensor self, *, ScalarType? dtype=None) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, c10::optional<ScalarType>>(const_cast<Tensor&>(*this), dtype);
-#endif
-}
-inline Tensor Tensor::mean(IntArrayRef dim, bool keepdim, c10::optional<ScalarType> dtype) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::mean(const_cast<Tensor&>(*this), dim, keepdim, dtype);
-            break;
-        case Backend::QuantizedCPU:
-            return QuantizedCPUType::mean(const_cast<Tensor&>(*this), dim, keepdim, dtype);
-            break;
-        default:
-            AT_ERROR("mean not implemented for ", at::toString(type_set()));
-    }
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::mean.dim(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, IntArrayRef, bool, c10::optional<ScalarType>>(const_cast<Tensor&>(*this), dim, keepdim, dtype);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::mean(DimnameList dim, bool keepdim, c10::optional<ScalarType> dtype) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::mean(const_cast<Tensor&>(*this), dim, keepdim, dtype);
-            break;
-        default:
-            AT_ERROR("mean not implemented for ", at::toString(type_set()));
-    }
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::mean.names_dim(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, DimnameList, bool, c10::optional<ScalarType>>(const_cast<Tensor&>(*this), dim, keepdim, dtype);
-#endif
-}
-#endif
-inline std::tuple<Tensor,Tensor> Tensor::median(int64_t dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::median(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::median", "dim"}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<std::tuple<Tensor,Tensor>, const Tensor &, int64_t, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline std::tuple<Tensor,Tensor> Tensor::median(Dimname dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::median(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::median.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)");
-    return table->callUnboxed<std::tuple<Tensor,Tensor>, const Tensor &, Dimname, bool>(const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#endif
-inline std::tuple<Tensor,Tensor> Tensor::min(int64_t dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::min(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::min", "dim"}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<std::tuple<Tensor,Tensor>, const Tensor &, int64_t, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-inline Tensor Tensor::min_values(IntArrayRef dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::min_values(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::min_values", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef, bool>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#ifdef BUILD_NAMEDTENSOR
-inline std::tuple<Tensor,Tensor> Tensor::min(Dimname dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::min(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::min.names_dim(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)");
-    return table->callUnboxed<std::tuple<Tensor,Tensor>, const Tensor &, Dimname, bool>(const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#endif
-#ifdef BUILD_NAMEDTENSOR
-inline Tensor Tensor::min_values(DimnameList dim, bool keepdim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::min_values(const_cast<Tensor&>(*this), dim, keepdim);
-#else
-    static auto table = globalATenDispatch().getOpTable("aten::min_values.names(Tensor self, Dimname[1] dim, bool keepdim=False) -> Tensor");
-    return table->callUnboxed<Tensor, const Tensor &, DimnameList, bool>(const_cast<Tensor&>(*this), dim, keepdim);
-#endif
-}
-#endif
-inline Tensor Tensor::mm(const Tensor & mat2) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::mm(const_cast<Tensor&>(*this), mat2);
-            break;
-        case Backend::SparseCPU:
-            return SparseCPUType::mm(const_cast<Tensor&>(*this), mat2);
-            break;
-        default:
AT_ERROR("mm not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat2)), const_cast(*this), mat2); -#endif -} -inline std::tuple Tensor::mode(int64_t dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::mode(const_cast(*this), dim, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mode", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, int64_t, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, keepdim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline std::tuple Tensor::mode(Dimname dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::mode(const_cast(*this), dim, keepdim); -#else - static auto table = globalATenDispatch().getOpTable("aten::mode.dimname(Tensor self, Dimname dim, bool keepdim=False) -> (Tensor values, Tensor indices)"); - return table->callUnboxed, const Tensor &, Dimname, bool>(const_cast(*this), dim, keepdim); -#endif -} -#endif -inline Tensor Tensor::mul(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::mul(const_cast(*this), other); - break; - case Backend::SparseCPU: - return SparseCPUType::mul(const_cast(*this), other); - break; - default: - AT_ERROR("mul not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mul", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor & Tensor::mul_(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::mul_(const_cast(*this), other); - break; - case Backend::SparseCPU: - return SparseCPUType::mul_(const_cast(*this), other); - break; - default: - AT_ERROR("mul_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mul_", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::mul(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::mul(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mul", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor & Tensor::mul_(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::mul_(const_cast(*this), other); -#else - static c10::OperatorHandle 
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &, Scalar>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), other);
-#endif
-}
-inline Tensor Tensor::mv(const Tensor & vec) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::mv(const_cast<Tensor&>(*this), vec);
-            break;
-        default:
-            AT_ERROR("mv not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mv", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, vec)), const_cast<Tensor&>(*this), vec);
-#endif
-}
-inline Tensor Tensor::mvlgamma(int64_t p) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::mvlgamma(const_cast<Tensor&>(*this), p);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mvlgamma", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), p);
-#endif
-}
-inline Tensor & Tensor::mvlgamma_(int64_t p) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::mvlgamma_(const_cast<Tensor&>(*this), p);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::mvlgamma_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &, int64_t>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), p);
-#endif
-}
-inline Tensor Tensor::narrow_copy(int64_t dim, int64_t start, int64_t length) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::narrow_copy(const_cast<Tensor&>(*this), dim, start, length);
-            break;
-        case Backend::SparseCPU:
-            return SparseCPUType::narrow_copy(const_cast<Tensor&>(*this), dim, start, length);
-            break;
-        default:
-            AT_ERROR("narrow_copy not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::narrow_copy", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, int64_t, int64_t>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, start, length);
-#endif
-}
-inline Tensor Tensor::narrow(int64_t dim, int64_t start, int64_t length) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::narrow(const_cast<Tensor&>(*this), dim, start, length);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::narrow", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, int64_t, int64_t, int64_t>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dim, start, length);
-#endif
-}
-inline Tensor Tensor::permute(IntArrayRef dims) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::permute(const_cast<Tensor&>(*this), dims);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::permute", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), dims);
-#endif
-}
-inline Tensor Tensor::numpy_T() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::numpy_T(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::numpy_T", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline bool Tensor::is_pinned() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::is_pinned(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_pinned", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<bool, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::pin_memory() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::pin_memory(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::pin_memory", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::pinverse(double rcond) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::pinverse(const_cast<Tensor&>(*this), rcond);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::pinverse", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, double>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), rcond);
-#endif
-}
-inline Tensor Tensor::reciprocal() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::reciprocal(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::reciprocal", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::reciprocal_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::reciprocal_(const_cast<Tensor&>(*this));
-            break;
-        default:
-            AT_ERROR("reciprocal_ not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::reciprocal_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::neg() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::neg(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::neg", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::neg_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::neg_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::neg_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::repeat(IntArrayRef repeats) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::repeat(const_cast<Tensor&>(*this), repeats);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::repeat", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), repeats);
-#endif
-}
-inline Tensor Tensor::repeat_interleave(const Tensor & repeats, c10::optional<int64_t> dim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::repeat_interleave(const_cast<Tensor&>(*this), repeats, dim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::repeat_interleave", "self_Tensor"}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &, c10::optional<int64_t>>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, repeats)), const_cast<Tensor&>(*this), repeats, dim);
-#endif
-}
-inline Tensor Tensor::repeat_interleave(int64_t repeats, c10::optional<int64_t> dim) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::repeat_interleave(const_cast<Tensor&>(*this), repeats, dim);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::repeat_interleave", "self_int"}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, int64_t, c10::optional<int64_t>>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), repeats, dim);
-#endif
-}
-inline Tensor Tensor::reshape(IntArrayRef shape) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::reshape(const_cast<Tensor&>(*this), shape);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::reshape", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor, const Tensor &, IntArrayRef>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this), shape);
-#endif
-}
-inline Tensor Tensor::reshape_as(const Tensor & other) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::reshape_as(const_cast<Tensor&>(*this), other);
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::reshape_as", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast<Tensor&>(*this), other);
-#endif
-}
-inline Tensor Tensor::round() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::round(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::round", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::round_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::round_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::round_", ""}).value();
c10::Dispatcher::singleton().findSchema({"aten::round_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::relu() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::relu(const_cast(*this)); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::relu(const_cast(*this)); - break; - default: - AT_ERROR("relu not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::relu", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::relu_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::relu_(const_cast(*this)); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::relu_(const_cast(*this)); - break; - default: - AT_ERROR("relu_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::relu_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::prelu(const Tensor & weight) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::prelu(const_cast(*this), weight); - break; - default: - AT_ERROR("prelu not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::prelu", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, weight)), const_cast(*this), weight); -#endif -} -inline std::tuple Tensor::prelu_backward(const Tensor & grad_output, const Tensor & weight) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::prelu_backward(grad_output, const_cast(*this), weight); - break; - default: - AT_ERROR("prelu_backward not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::prelu_backward", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, const Tensor &, const Tensor &>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(grad_output, *this, weight)), grad_output, const_cast(*this), weight); -#endif -} -inline Tensor Tensor::hardshrink(Scalar lambd) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::hardshrink(const_cast(*this), lambd); - break; - default: - AT_ERROR("hardshrink not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = 
c10::Dispatcher::singleton().findSchema({"aten::hardshrink", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), lambd); -#endif -} -inline Tensor Tensor::hardshrink_backward(const Tensor & grad_out, Scalar lambd) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::hardshrink_backward(grad_out, const_cast(*this), lambd); - break; - default: - AT_ERROR("hardshrink_backward not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::hardshrink_backward", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(grad_out, *this)), grad_out, const_cast(*this), lambd); -#endif -} -inline Tensor Tensor::rsqrt() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::rsqrt(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::rsqrt", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::rsqrt_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::rsqrt_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::rsqrt_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::select(Dimname dim, int64_t index) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::select(const_cast(*this), dim, index); -#else - static auto table = globalATenDispatch().getOpTable("aten::select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)"); - return table->callUnboxed(const_cast(*this), dim, index); -#endif -} -#endif -inline Tensor Tensor::select(int64_t dim, int64_t index) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::select(const_cast(*this), dim, index); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::select", "int"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, index); -#endif -} -inline Tensor Tensor::sigmoid() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::sigmoid(const_cast(*this)); - break; - default: - AT_ERROR("sigmoid not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sigmoid", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::sigmoid_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - 
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::sigmoid_(const_cast<Tensor&>(*this));
-            break;
-        default:
-            AT_ERROR("sigmoid_ not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sigmoid_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::sin() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::sin(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sin", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::sin_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::sin_(const_cast<Tensor&>(*this));
-            break;
-        default:
-            AT_ERROR("sin_ not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sin_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::sinh() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::sinh(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sinh", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::sinh_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) {
-        case Backend::CPU:
-            return CPUType::sinh_(const_cast<Tensor&>(*this));
-            break;
-        default:
-            AT_ERROR("sinh_ not implemented for ", at::toString(type_set()));
-    }
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sinh_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor Tensor::detach() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::detach(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::detach", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxed<Tensor, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline Tensor & Tensor::detach_() const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
-    return TypeDefault::detach_(const_cast<Tensor&>(*this));
-#else
-    static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::detach_", ""}).value();
-    return c10::Dispatcher::singleton().callUnboxedOnly<Tensor &, const Tensor &>(
-        op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast<Tensor&>(*this));
-#endif
-}
-inline int64_t Tensor::size(int64_t dim) const {
-#ifdef USE_STATIC_DISPATCH
USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::size(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::size", "int"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline int64_t Tensor::size(Dimname dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::size(const_cast(*this), dim); -#else - static auto table = globalATenDispatch().getOpTable("aten::size.Dimname(Tensor self, Dimname dim) -> int"); - return table->callUnboxed(const_cast(*this), dim); -#endif -} -#endif -inline Tensor Tensor::slice(int64_t dim, int64_t start, int64_t end, int64_t step) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::slice(const_cast(*this), dim, start, end, step); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::slice", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, start, end, step); -#endif -} -inline std::tuple Tensor::slogdet() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::slogdet(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::slogdet", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::smm(const Tensor & mat2) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::smm(const_cast(*this), mat2); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::smm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat2)), const_cast(*this), mat2); -#endif -} -inline Tensor Tensor::softmax(int64_t dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::softmax(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::softmax(Tensor self, int dim, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, dtype); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::softmax(Dimname dim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::softmax(const_cast(*this), dim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::softmax(Tensor self, Dimname dim, *, ScalarType? 
dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, dtype); -#endif -} -#endif -inline std::vector Tensor::split(int64_t split_size, int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::split(const_cast(*this), split_size, dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::split", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, int64_t, int64_t>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), split_size, dim); -#endif -} -inline std::vector Tensor::split_with_sizes(IntArrayRef split_sizes, int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::split_with_sizes(const_cast(*this), split_sizes, dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::split_with_sizes", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, IntArrayRef, int64_t>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), split_sizes, dim); -#endif -} -inline Tensor Tensor::squeeze() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::squeeze(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::squeeze", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::squeeze(int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::squeeze(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::squeeze", "dim"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::squeeze(Dimname dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::squeeze(const_cast(*this), dim); -#else - static auto table = globalATenDispatch().getOpTable("aten::squeeze.dimname(Tensor(a) self, Dimname dim) -> Tensor(a)"); - return table->callUnboxed(const_cast(*this), dim); -#endif -} -#endif -inline Tensor & Tensor::squeeze_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::squeeze_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::squeeze_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::squeeze_(int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::squeeze_(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::squeeze_", "dim"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor & 
Tensor::squeeze_(Dimname dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::squeeze_(const_cast(*this), dim); -#else - static auto table = globalATenDispatch().getOpTable("aten::squeeze_.dimname(Tensor(a!) self, Dimname dim) -> Tensor(a!)"); - return table->callUnboxed(const_cast(*this), dim); -#endif -} -#endif -inline Tensor Tensor::sspaddmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sspaddmm(const_cast(*this), mat1, mat2, beta, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sspaddmm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat1, mat2)), const_cast(*this), mat1, mat2, beta, alpha); -#endif -} -inline Tensor Tensor::stft(int64_t n_fft, c10::optional hop_length, c10::optional win_length, const Tensor & window, bool normalized, bool onesided) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::stft(const_cast(*this), n_fft, hop_length, win_length, window, normalized, onesided); -#else - static auto table = globalATenDispatch().getOpTable("aten::stft(Tensor self, int n_fft, int? hop_length=None, int? win_length=None, Tensor? window=None, bool normalized=False, bool onesided=True) -> Tensor"); - return table->callUnboxed, c10::optional, const Tensor &, bool, bool>(const_cast(*this), n_fft, hop_length, win_length, window, normalized, onesided); -#endif -} -inline int64_t Tensor::stride(int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::stride(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::stride", "int"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline int64_t Tensor::stride(Dimname dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::stride(const_cast(*this), dim); -#else - static auto table = globalATenDispatch().getOpTable("aten::stride.Dimname(Tensor self, Dimname dim) -> int"); - return table->callUnboxed(const_cast(*this), dim); -#endif -} -#endif -inline Tensor Tensor::sum(c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sum(const_cast(*this), dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::sum(Tensor self, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dtype); -#endif -} -inline Tensor Tensor::sum(IntArrayRef dim, bool keepdim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sum(const_cast(*this), dim, keepdim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::sum.dim_IntList(Tensor self, int[1] dim, bool keepdim=False, *, ScalarType? 
dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, keepdim, dtype); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::sum(DimnameList dim, bool keepdim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sum(const_cast(*this), dim, keepdim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::sum.dim_DimnameList(Tensor self, Dimname[1] dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, keepdim, dtype); -#endif -} -#endif -inline Tensor Tensor::sum_to_size(IntArrayRef size) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sum_to_size(const_cast(*this), size); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sum_to_size", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), size); -#endif -} -inline Tensor Tensor::sqrt() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sqrt(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sqrt", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::sqrt_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::sqrt_(const_cast(*this)); - break; - default: - AT_ERROR("sqrt_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sqrt_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::std(bool unbiased) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::std(const_cast(*this), unbiased); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::std", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), unbiased); -#endif -} -inline Tensor Tensor::std(IntArrayRef dim, bool unbiased, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::std(const_cast(*this), dim, unbiased, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::std", "dim"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, unbiased, keepdim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::std(DimnameList dim, bool unbiased, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::std(const_cast(*this), dim, unbiased, keepdim); -#else - static auto table = globalATenDispatch().getOpTable("aten::std.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool 
keepdim=False) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, unbiased, keepdim); -#endif -} -#endif -inline Tensor Tensor::prod(c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::prod(const_cast(*this), dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::prod(Tensor self, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dtype); -#endif -} -inline Tensor Tensor::prod(int64_t dim, bool keepdim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::prod(const_cast(*this), dim, keepdim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::prod.dim_int(Tensor self, int dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, keepdim, dtype); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::prod(Dimname dim, bool keepdim, c10::optional dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::prod(const_cast(*this), dim, keepdim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::prod.dim_Dimname(Tensor self, Dimname dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), dim, keepdim, dtype); -#endif -} -#endif -inline Tensor Tensor::t() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::t(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::t", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::t_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::t_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::t_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::tan() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::tan(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::tan", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::tan_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::tan_(const_cast(*this)); - break; - default: - AT_ERROR("tan_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::tan_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::tanh() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::tanh(const_cast(*this)); -#else 
- static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::tanh", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::tanh_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::tanh_(const_cast(*this)); - break; - default: - AT_ERROR("tanh_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::tanh_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::transpose(int64_t dim0, int64_t dim1) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::transpose(const_cast(*this), dim0, dim1); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::transpose", "int"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim0, dim1); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::transpose(Dimname dim0, Dimname dim1) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::transpose(const_cast(*this), dim0, dim1); -#else - static auto table = globalATenDispatch().getOpTable("aten::transpose.Dimname(Tensor(a) self, Dimname dim0, Dimname dim1) -> Tensor(a)"); - return table->callUnboxed(const_cast(*this), dim0, dim1); -#endif -} -#endif -inline Tensor & Tensor::transpose_(int64_t dim0, int64_t dim1) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::transpose_(const_cast(*this), dim0, dim1); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::transpose_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim0, dim1); -#endif -} -inline Tensor Tensor::flip(IntArrayRef dims) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::flip(const_cast(*this), dims); - break; - default: - AT_ERROR("flip not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::flip", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dims); -#endif -} -inline Tensor Tensor::roll(IntArrayRef shifts, IntArrayRef dims) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::roll(const_cast(*this), shifts, dims); - break; - default: - AT_ERROR("roll not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::roll", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, 
impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), shifts, dims); -#endif -} -inline Tensor Tensor::rot90(int64_t k, IntArrayRef dims) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::rot90(const_cast(*this), k, dims); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::rot90", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), k, dims); -#endif -} -inline Tensor Tensor::trunc() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::trunc(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::trunc", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::trunc_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::trunc_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::trunc_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::type_as(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::type_as(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::type_as", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::unsqueeze(int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::unsqueeze(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::unsqueeze", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -inline Tensor & Tensor::unsqueeze_(int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::unsqueeze_(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::unsqueeze_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -inline Tensor Tensor::var(bool unbiased) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::var(const_cast(*this), unbiased); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::var", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), unbiased); -#endif -} -inline Tensor Tensor::var(IntArrayRef dim, bool unbiased, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return 
TypeDefault::var(const_cast(*this), dim, unbiased, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::var", "dim"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, unbiased, keepdim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::var(DimnameList dim, bool unbiased, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::var(const_cast(*this), dim, unbiased, keepdim); -#else - static auto table = globalATenDispatch().getOpTable("aten::var.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, unbiased, keepdim); -#endif -} -#endif -inline Tensor Tensor::view_as(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::view_as(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::view_as", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::where(const Tensor & condition, const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::where(condition, const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::where", "self"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(condition, *this, other)), condition, const_cast(*this), other); -#endif -} -inline Tensor Tensor::norm(c10::optional p, ScalarType dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::norm(const_cast(*this), p, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::norm.ScalarOpt_dtype(Tensor self, Scalar? p, *, ScalarType dtype) -> Tensor"); - return table->callUnboxed, ScalarType>(const_cast(*this), p, dtype); -#endif -} -inline Tensor Tensor::norm(Scalar p) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::norm(const_cast(*this), p); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::norm", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), p); -#endif -} -inline Tensor Tensor::norm(c10::optional p, IntArrayRef dim, bool keepdim, ScalarType dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::norm(const_cast(*this), p, dim, keepdim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::norm.ScalarOpt_dim_dtype(Tensor self, Scalar? 
p, int[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor"); - return table->callUnboxed, IntArrayRef, bool, ScalarType>(const_cast(*this), p, dim, keepdim, dtype); -#endif -} -inline Tensor Tensor::norm(c10::optional p, IntArrayRef dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::norm(const_cast(*this), p, dim, keepdim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::norm", "ScalarOpt_dim"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, IntArrayRef, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), p, dim, keepdim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::norm(c10::optional p, DimnameList dim, bool keepdim, ScalarType dtype) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::norm(const_cast(*this), p, dim, keepdim, dtype); -#else - static auto table = globalATenDispatch().getOpTable("aten::norm.names_ScalarOpt_dim_dtype(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim, *, ScalarType dtype) -> Tensor"); - return table->callUnboxed, DimnameList, bool, ScalarType>(const_cast(*this), p, dim, keepdim, dtype); -#endif -} -#endif -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::norm(c10::optional p, DimnameList dim, bool keepdim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::norm(const_cast(*this), p, dim, keepdim); -#else - static auto table = globalATenDispatch().getOpTable("aten::norm.names_ScalarOpt_dim(Tensor self, Scalar? p, Dimname[1] dim, bool keepdim=False) -> Tensor"); - return table->callUnboxed, DimnameList, bool>(const_cast(*this), p, dim, keepdim); -#endif -} -#endif -inline Tensor Tensor::clone(c10::optional memory_format) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::clone(const_cast(*this), memory_format); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::clone(const_cast(*this), memory_format); - break; - case Backend::SparseCPU: - return SparseCPUType::clone(const_cast(*this), memory_format); - break; - default: - AT_ERROR("clone not implemented for ", at::toString(type_set())); - } -#else - static auto table = globalATenDispatch().getOpTable("aten::clone(Tensor self, *, MemoryFormat? 
memory_format=None) -> Tensor"); - return table->callUnboxed>(const_cast(*this), memory_format); -#endif -} -inline Tensor & Tensor::resize_as_(const Tensor & the_template) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::resize_as_(const_cast(*this), the_template); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::resize_as_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, the_template)), const_cast(*this), the_template); -#endif -} -inline Tensor Tensor::pow(Scalar exponent) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::pow(const_cast(*this), exponent); - break; - case Backend::SparseCPU: - return SparseCPUType::pow(const_cast(*this), exponent); - break; - default: - AT_ERROR("pow not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::pow", "Tensor_Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), exponent); -#endif -} -inline Tensor & Tensor::zero_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::zero_(const_cast(*this)); - break; - case Backend::SparseCPU: - return SparseCPUType::zero_(const_cast(*this)); - break; - default: - AT_ERROR("zero_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::zero_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::sub(const Tensor & other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::sub(const_cast(*this), other, alpha); - break; - case Backend::SparseCPU: - return SparseCPUType::sub(const_cast(*this), other, alpha); - break; - default: - AT_ERROR("sub not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sub", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other, alpha); -#endif -} -inline Tensor & Tensor::sub_(const Tensor & other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::sub_(const_cast(*this), other, alpha); - break; - case Backend::SparseCPU: - return SparseCPUType::sub_(const_cast(*this), other, alpha); - break; - default: - AT_ERROR("sub_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sub_", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, 
impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other, alpha); -#endif -} -inline Tensor Tensor::sub(Scalar other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sub(const_cast(*this), other, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sub", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other, alpha); -#endif -} -inline Tensor & Tensor::sub_(Scalar other, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sub_(const_cast(*this), other, alpha); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sub_", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other, alpha); -#endif -} -inline Tensor Tensor::addmm(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::addmm(const_cast(*this), mat1, mat2, beta, alpha); - break; - case Backend::SparseCPU: - return SparseCPUType::addmm(const_cast(*this), mat1, mat2, beta, alpha); - break; - default: - AT_ERROR("addmm not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addmm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat1, mat2)), const_cast(*this), mat1, mat2, beta, alpha); -#endif -} -inline Tensor & Tensor::addmm_(const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::addmm_(const_cast(*this), mat1, mat2, beta, alpha); - break; - case Backend::SparseCPU: - return SparseCPUType::addmm_(const_cast(*this), mat1, mat2, beta, alpha); - break; - default: - AT_ERROR("addmm_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addmm_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mat1, mat2)), const_cast(*this), mat1, mat2, beta, alpha); -#endif -} -inline Tensor & Tensor::sparse_resize_(IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::sparse_resize_(const_cast(*this), size, sparse_dim, dense_dim); - break; - default: - AT_ERROR("sparse_resize_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sparse_resize_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), 
const_cast(*this), size, sparse_dim, dense_dim); -#endif -} -inline Tensor & Tensor::sparse_resize_and_clear_(IntArrayRef size, int64_t sparse_dim, int64_t dense_dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::sparse_resize_and_clear_(const_cast(*this), size, sparse_dim, dense_dim); - break; - default: - AT_ERROR("sparse_resize_and_clear_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sparse_resize_and_clear_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), size, sparse_dim, dense_dim); -#endif -} -inline Tensor Tensor::sparse_mask(const Tensor & mask) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::sparse_mask(const_cast(*this), mask); - break; - default: - AT_ERROR("sparse_mask not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sparse_mask", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mask)), const_cast(*this), mask); -#endif -} -inline Tensor Tensor::to_dense() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::to_dense(const_cast(*this)); - break; - default: - AT_ERROR("to_dense not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::to_dense", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::sparse_dim() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::sparse_dim(const_cast(*this)); - break; - default: - AT_ERROR("sparse_dim not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sparse_dim", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::_dimI() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::_dimI(const_cast(*this)); - break; - default: - AT_ERROR("_dimI not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::_dimI", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::dense_dim() const { -#ifdef USE_STATIC_DISPATCH - 
at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::dense_dim(const_cast(*this)); - break; - default: - AT_ERROR("dense_dim not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::dense_dim", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::_dimV() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::_dimV(const_cast(*this)); - break; - default: - AT_ERROR("_dimV not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::_dimV", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::_nnz() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::_nnz(const_cast(*this)); - break; - default: - AT_ERROR("_nnz not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::_nnz", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::coalesce() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::coalesce(const_cast(*this)); - break; - default: - AT_ERROR("coalesce not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::coalesce", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline bool Tensor::is_coalesced() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::is_coalesced(const_cast(*this)); - break; - default: - AT_ERROR("is_coalesced not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::is_coalesced", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::_indices() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::_indices(const_cast(*this)); - break; - default: - AT_ERROR("_indices not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = 
c10::Dispatcher::singleton().findSchema({"aten::_indices", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::_values() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::_values(const_cast(*this)); - break; - default: - AT_ERROR("_values not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::_values", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::_coalesced_(bool coalesced) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::_coalesced_(const_cast(*this), coalesced); - break; - default: - AT_ERROR("_coalesced_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::_coalesced_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), coalesced); -#endif -} -inline Tensor Tensor::indices() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::indices(const_cast(*this)); - break; - default: - AT_ERROR("indices not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::indices", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::values() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::SparseCPU: - return SparseCPUType::values(const_cast(*this)); - break; - default: - AT_ERROR("values not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::values", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::numel() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::numel(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::numel", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline std::vector Tensor::unbind(int64_t dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::unbind(const_cast(*this), dim); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::unbind", 
"int"}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, int64_t>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline std::vector Tensor::unbind(Dimname dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::unbind(const_cast(*this), dim); -#else - static auto table = globalATenDispatch().getOpTable("aten::unbind.Dimname(Tensor(a) self, Dimname dim) -> Tensor(a)[]"); - return table->callUnboxed, const Tensor &, Dimname>(const_cast(*this), dim); -#endif -} -#endif -inline Tensor Tensor::to_sparse(int64_t sparse_dim) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::to_sparse(const_cast(*this), sparse_dim); - break; - default: - AT_ERROR("to_sparse not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::to_sparse", "sparse_dim"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), sparse_dim); -#endif -} -inline Tensor Tensor::to_sparse() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::to_sparse(const_cast(*this)); - break; - default: - AT_ERROR("to_sparse not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::to_sparse", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::to_mkldnn() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::to_mkldnn(const_cast(*this)); - break; - default: - AT_ERROR("to_mkldnn not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::to_mkldnn", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::dequantize() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::dequantize(const_cast(*this)); - break; - default: - AT_ERROR("dequantize not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::dequantize", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline double Tensor::q_scale() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::q_scale(const_cast(*this)); - break; - default: - AT_ERROR("q_scale not 
implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::q_scale", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::q_zero_point() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::q_zero_point(const_cast(*this)); - break; - default: - AT_ERROR("q_zero_point not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::q_zero_point", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::q_per_channel_scales() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::q_per_channel_scales(const_cast(*this)); - break; - default: - AT_ERROR("q_per_channel_scales not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::q_per_channel_scales", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::q_per_channel_zero_points() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::q_per_channel_zero_points(const_cast(*this)); - break; - default: - AT_ERROR("q_per_channel_zero_points not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::q_per_channel_zero_points", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline int64_t Tensor::q_per_channel_axis() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::q_per_channel_axis(const_cast(*this)); - break; - default: - AT_ERROR("q_per_channel_axis not implemented for ", at::toString(type_set())); - } -#else - static auto table = globalATenDispatch().getOpTable("aten::q_per_channel_axis(Tensor self) -> int"); - return table->callUnboxed(const_cast(*this)); -#endif -} -inline Tensor Tensor::int_repr() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::QuantizedCPU: - return QuantizedCPUType::int_repr(const_cast(*this)); - break; - default: - AT_ERROR("int_repr not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::int_repr", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, 
-#endif
-}
[... roughly a hundred further removed inline Tensor method stubs elided, all following the two dispatch patterns shown above (a Backend switch under USE_STATIC_DISPATCH, otherwise an unboxed c10::Dispatcher or globalATenDispatch call): qscheme; the four to() overloads; item; the four set_() overloads; set_quantizer_; is_set_to; masked_fill_/masked_fill; masked_scatter_/masked_scatter; view; put_; index_add_/index_add; index_fill_/index_fill; scatter_/scatter; scatter_add_/scatter_add (plus their BUILD_NAMEDTENSOR Dimname overloads); the in-place comparisons lt_, gt_, le_, ge_, eq_, ne_; the bitwise and shift operators __and__/__iand__, __or__/__ior__, __xor__/__ixor__, __lshift__/__ilshift__, __rshift__/__irshift__; lgamma_; atan2_; tril_; triu_; digamma_; polygamma_; renorm_; pow_; lerp_; fmod_; remainder_; addbmm_/addbmm; addcdiv_; the random-fill methods random_, uniform_, normal_, cauchy_, log_normal_, exponential_, geometric_; diag; cross; triu; tril; trace; and the comparisons ne, eq, ge, le ...]
-inline Tensor Tensor::gt(Scalar other) const {
-#ifdef USE_STATIC_DISPATCH
-    at::AutoNonVariableTypeMode _var_guard(true);
switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::gt(const_cast(*this), other); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::gt(const_cast(*this), other); - break; - default: - AT_ERROR("gt not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::gt", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::gt(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::gt(const_cast(*this), other); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::gt(const_cast(*this), other); - break; - default: - AT_ERROR("gt not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::gt", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::lt(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::lt(const_cast(*this), other); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::lt(const_cast(*this), other); - break; - default: - AT_ERROR("lt not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lt", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::lt(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::lt(const_cast(*this), other); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::lt(const_cast(*this), other); - break; - default: - AT_ERROR("lt not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lt", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::take(const Tensor & index) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::take(const_cast(*this), index); - break; - default: - AT_ERROR("take not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::take", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, index)), const_cast(*this), index); -#endif -} -inline Tensor Tensor::index_select(int64_t dim, const Tensor & index) const { -#ifdef 
USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::index_select(const_cast(*this), dim, index); - break; - case Backend::SparseCPU: - return SparseCPUType::index_select(const_cast(*this), dim, index); - break; - default: - AT_ERROR("index_select not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::index_select", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, index)), const_cast(*this), dim, index); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::index_select(Dimname dim, const Tensor & index) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::index_select(const_cast(*this), dim, index); -#else - static auto table = globalATenDispatch().getOpTable("aten::index_select.dimname(Tensor self, Dimname dim, Tensor index) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, index); -#endif -} -#endif -inline Tensor Tensor::masked_select(const Tensor & mask) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::masked_select(const_cast(*this), mask); - break; - default: - AT_ERROR("masked_select not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::masked_select", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, mask)), const_cast(*this), mask); -#endif -} -inline Tensor Tensor::nonzero() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::nonzero(const_cast(*this)); - break; - default: - AT_ERROR("nonzero not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::nonzero", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline std::vector Tensor::nonzero_numpy() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::nonzero_numpy(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::nonzero_numpy", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::gather(int64_t dim, const Tensor & index, bool sparse_grad) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::gather(const_cast(*this), dim, index, sparse_grad); - break; - default: - AT_ERROR("gather not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::gather", ""}).value(); - return 
c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, index)), const_cast(*this), dim, index, sparse_grad); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::gather(Dimname dim, const Tensor & index, bool sparse_grad) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::gather(const_cast(*this), dim, index, sparse_grad); -#else - static auto table = globalATenDispatch().getOpTable("aten::gather.dimname(Tensor self, Dimname dim, Tensor index, *, bool sparse_grad=False) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, index, sparse_grad); -#endif -} -#endif -inline Tensor Tensor::addcmul(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::addcmul(const_cast(*this), tensor1, tensor2, value); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addcmul", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, tensor1, tensor2)), const_cast(*this), tensor1, tensor2, value); -#endif -} -inline Tensor & Tensor::addcmul_(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::addcmul_(const_cast(*this), tensor1, tensor2, value); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addcmul_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, tensor1, tensor2)), const_cast(*this), tensor1, tensor2, value); -#endif -} -inline Tensor Tensor::addcdiv(const Tensor & tensor1, const Tensor & tensor2, Scalar value) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::addcdiv(const_cast(*this), tensor1, tensor2, value); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::addcdiv", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, tensor1, tensor2)), const_cast(*this), tensor1, tensor2, value); -#endif -} -inline std::tuple Tensor::lstsq(const Tensor & A) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::lstsq(const_cast(*this), A); - break; - default: - AT_ERROR("lstsq not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lstsq", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, const Tensor &>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, A)), const_cast(*this), A); -#endif -} -inline std::tuple Tensor::triangular_solve(const Tensor & A, bool upper, bool transpose, bool unitriangular) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::triangular_solve(const_cast(*this), A, upper, transpose, unitriangular); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::triangular_solve", ""}).value(); - return 
c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, const Tensor &, bool, bool, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, A)), const_cast(*this), A, upper, transpose, unitriangular); -#endif -} -inline std::tuple Tensor::symeig(bool eigenvectors, bool upper) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::symeig(const_cast(*this), eigenvectors, upper); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::symeig", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, bool, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), eigenvectors, upper); -#endif -} -inline std::tuple Tensor::eig(bool eigenvectors) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::eig(const_cast(*this), eigenvectors); - break; - default: - AT_ERROR("eig not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::eig", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), eigenvectors); -#endif -} -inline std::tuple Tensor::svd(bool some, bool compute_uv) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::svd(const_cast(*this), some, compute_uv); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::svd", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, bool, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), some, compute_uv); -#endif -} -inline Tensor Tensor::cholesky(bool upper) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cholesky(const_cast(*this), upper); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cholesky", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), upper); -#endif -} -inline Tensor Tensor::cholesky_solve(const Tensor & input2, bool upper) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::cholesky_solve(const_cast(*this), input2, upper); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cholesky_solve", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, input2)), const_cast(*this), input2, upper); -#endif -} -inline std::tuple Tensor::solve(const Tensor & A) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::solve(const_cast(*this), A); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::solve", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, const Tensor &>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, A)), const_cast(*this), A); -#endif -} -inline Tensor Tensor::cholesky_inverse(bool upper) 
const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::cholesky_inverse(const_cast(*this), upper); - break; - default: - AT_ERROR("cholesky_inverse not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::cholesky_inverse", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), upper); -#endif -} -inline std::tuple Tensor::qr(bool some) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::qr(const_cast(*this), some); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::qr", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), some); -#endif -} -inline std::tuple Tensor::geqrf() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::geqrf(const_cast(*this)); - break; - default: - AT_ERROR("geqrf not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::geqrf", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::orgqr(const Tensor & input2) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::orgqr(const_cast(*this), input2); - break; - default: - AT_ERROR("orgqr not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::orgqr", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, input2)), const_cast(*this), input2); -#endif -} -inline Tensor Tensor::ormqr(const Tensor & input2, const Tensor & input3, bool left, bool transpose) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::ormqr(const_cast(*this), input2, input3, left, transpose); - break; - default: - AT_ERROR("ormqr not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::ormqr", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, input2, input3)), const_cast(*this), input2, input3, left, transpose); -#endif -} -inline Tensor Tensor::lu_solve(const Tensor & LU_data, const Tensor & LU_pivots) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::lu_solve(const_cast(*this), LU_data, LU_pivots); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lu_solve", ""}).value(); - return 
c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, LU_data, LU_pivots)), const_cast(*this), LU_data, LU_pivots); -#endif -} -inline Tensor Tensor::multinomial(int64_t num_samples, bool replacement, Generator * generator) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::multinomial(const_cast(*this), num_samples, replacement, generator); - break; - default: - AT_ERROR("multinomial not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::multinomial", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), num_samples, replacement, generator); -#endif -} -inline Tensor Tensor::lgamma() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::lgamma(const_cast(*this)); - break; - default: - AT_ERROR("lgamma not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lgamma", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::digamma() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::digamma(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::digamma", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::polygamma(int64_t n) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::polygamma(n, const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::polygamma", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), n, const_cast(*this)); -#endif -} -inline Tensor Tensor::erfinv() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::erfinv(const_cast(*this)); - break; - default: - AT_ERROR("erfinv not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::erfinv", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::erfinv_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::erfinv_(const_cast(*this)); - break; - default: - AT_ERROR("erfinv_ not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::erfinv_", 
""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::sign() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sign(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sign", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor & Tensor::sign_() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sign_(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sign_", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::dist(const Tensor & other, Scalar p) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::dist(const_cast(*this), other, p); - break; - default: - AT_ERROR("dist not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::dist", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other, p); -#endif -} -inline Tensor Tensor::atan2(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::atan2(const_cast(*this), other); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::atan2", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::lerp(const Tensor & end, Scalar weight) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::lerp(const_cast(*this), end, weight); - break; - default: - AT_ERROR("lerp not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lerp", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, end)), const_cast(*this), end, weight); -#endif -} -inline Tensor Tensor::lerp(const Tensor & end, const Tensor & weight) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::lerp(const_cast(*this), end, weight); - break; - default: - AT_ERROR("lerp not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::lerp", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, end, weight)), const_cast(*this), end, 
weight); -#endif -} -inline Tensor Tensor::histc(int64_t bins, Scalar min, Scalar max) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::histc(const_cast(*this), bins, min, max); - break; - default: - AT_ERROR("histc not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::histc", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), bins, min, max); -#endif -} -inline Tensor Tensor::fmod(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::fmod(const_cast(*this), other); - break; - default: - AT_ERROR("fmod not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::fmod", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::fmod(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::fmod(const_cast(*this), other); - break; - default: - AT_ERROR("fmod not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::fmod", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::remainder(Scalar other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::remainder(const_cast(*this), other); - break; - default: - AT_ERROR("remainder not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::remainder", "Scalar"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::remainder(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::remainder(const_cast(*this), other); - break; - default: - AT_ERROR("remainder not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::remainder", "Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::min(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - 
return CPUType::min(const_cast(*this), other); - break; - default: - AT_ERROR("min not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::min", "other"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::min() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::min(const_cast(*this)); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::min(const_cast(*this)); - break; - default: - AT_ERROR("min not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::min", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::max(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::max(const_cast(*this), other); - break; - default: - AT_ERROR("max not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::max", "other"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::max() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::max(const_cast(*this)); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::max(const_cast(*this)); - break; - default: - AT_ERROR("max not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::max", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::median() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::median(const_cast(*this)); - break; - default: - AT_ERROR("median not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::median", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline std::tuple Tensor::sort(int64_t dim, bool descending) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::sort(const_cast(*this), dim, descending); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::sort(const_cast(*this), dim, descending); - break; - default: - AT_ERROR("sort not implemented for ", 
at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::sort", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, int64_t, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, descending); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline std::tuple Tensor::sort(Dimname dim, bool descending) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::sort(const_cast(*this), dim, descending); -#else - static auto table = globalATenDispatch().getOpTable("aten::sort.dimname(Tensor self, Dimname dim, bool descending=False) -> (Tensor values, Tensor indices)"); - return table->callUnboxed, const Tensor &, Dimname, bool>(const_cast(*this), dim, descending); -#endif -} -#endif -inline Tensor Tensor::argsort(int64_t dim, bool descending) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::argsort(const_cast(*this), dim, descending); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::argsort", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dim, descending); -#endif -} -#ifdef BUILD_NAMEDTENSOR -inline Tensor Tensor::argsort(Dimname dim, bool descending) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::argsort(const_cast(*this), dim, descending); -#else - static auto table = globalATenDispatch().getOpTable("aten::argsort.dimname(Tensor self, Dimname dim, bool descending=False) -> Tensor"); - return table->callUnboxed(const_cast(*this), dim, descending); -#endif -} -#endif -inline std::tuple Tensor::topk(int64_t k, int64_t dim, bool largest, bool sorted) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::topk(const_cast(*this), k, dim, largest, sorted); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::topk(const_cast(*this), k, dim, largest, sorted); - break; - default: - AT_ERROR("topk not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::topk", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly, const Tensor &, int64_t, int64_t, bool, bool>( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), k, dim, largest, sorted); -#endif -} -inline Tensor Tensor::all() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::all(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::all", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} -inline Tensor Tensor::any() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::any(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::any", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), 
const_cast(*this)); -#endif -} -inline Tensor Tensor::renorm(Scalar p, int64_t dim, Scalar maxnorm) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::renorm(const_cast(*this), p, dim, maxnorm); - break; - default: - AT_ERROR("renorm not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::renorm", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), p, dim, maxnorm); -#endif -} -inline Tensor Tensor::unfold(int64_t dimension, int64_t size, int64_t step) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::unfold(const_cast(*this), dimension, size, step); - break; - default: - AT_ERROR("unfold not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::unfold", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this), dimension, size, step); -#endif -} -inline bool Tensor::equal(const Tensor & other) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::equal(const_cast(*this), other); - break; - case Backend::QuantizedCPU: - return QuantizedCPUType::equal(const_cast(*this), other); - break; - default: - AT_ERROR("equal not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::equal", ""}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, other)), const_cast(*this), other); -#endif -} -inline Tensor Tensor::pow(const Tensor & exponent) const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - switch(tensorTypeIdToBackend(impl::dispatchTypeId(type_set()))) { - case Backend::CPU: - return CPUType::pow(const_cast(*this), exponent); - break; - default: - AT_ERROR("pow not implemented for ", at::toString(type_set())); - } -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::pow", "Tensor_Tensor"}).value(); - return c10::Dispatcher::singleton().callUnboxed( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this, exponent)), const_cast(*this), exponent); -#endif -} -inline Tensor Tensor::alias() const { -#ifdef USE_STATIC_DISPATCH - at::AutoNonVariableTypeMode _var_guard(true); - return TypeDefault::alias(const_cast(*this)); -#else - static c10::OperatorHandle op = c10::Dispatcher::singleton().findSchema({"aten::alias", ""}).value(); - return c10::Dispatcher::singleton().callUnboxedOnly( - op, impl::dispatchTypeId(at::detail::multi_dispatch_tensor_type_set(*this)), const_cast(*this)); -#endif -} - -inline caffe2::TypeMeta Tensor::dtype() const noexcept { - return impl_->dtype(); -} - -inline Layout Tensor::layout() const noexcept { - return impl_->layout(); -} - -inline Device Tensor::device() const { - return impl_->device(); -} - -inline int64_t 
-Tensor::get_device() const {
-  // NB: this is not a native function to avoid dispatching overhead.
-  return impl_->get_device();
-}
-
-inline int64_t get_device(Tensor self) {
-  return self.get_device();
-}
-
-inline bool Tensor::is_cuda() const {
-  // NB: this is not a native function to avoid dispatching overhead.
-  return impl_->is_cuda();
-}
-
-#ifdef BUILD_NAMEDTENSOR
-inline NamedTensorMeta* Tensor::get_named_tensor_meta() {
-  return static_cast<NamedTensorMeta*>(impl_->named_tensor_meta());
-}
-
-inline const NamedTensorMeta* Tensor::get_named_tensor_meta() const {
-  return static_cast<const NamedTensorMeta*>(impl_->named_tensor_meta());
-}
-
-inline bool Tensor::has_names() const {
-  return impl::has_names(unsafeGetTensorImpl());
-}
-#endif
-
-inline bool is_cuda(Tensor self) {
-  return self.is_cuda();
-}
-
-inline bool Tensor::is_hip() const {
-  // NB: this is not a native function to avoid dispatching overhead.
-  return impl_->is_hip();
-}
-
-inline bool is_hip(Tensor self) {
-  return self.is_hip();
-}
-
-inline bool Tensor::is_sparse() const {
-  // NB: this is not a native function to avoid dispatching overhead.
-  return impl_->is_sparse();
-}
-
-inline bool is_sparse(Tensor self) {
-  return self.is_sparse();
-}
-
-inline bool Tensor::is_mkldnn() const {
-  // NB: this is not a native function to avoid dispatching overhead.
-  return impl_->is_mkldnn();
-}
-
-inline bool is_mkldnn(Tensor self) {
-  return self.is_mkldnn();
-}
-
-inline bool Tensor::is_quantized() const {
-  // NB: this is not a native function to avoid dispatching overhead.
-  return impl_->is_quantized();
-}
-
-inline bool is_quantized(Tensor self) {
-  return self.is_quantized();
-}
-
-#define DEFINE_CAST(T, name)                   \
-  template <>                                  \
-  inline T* Tensor::data_ptr() const {         \
-    TORCH_CHECK(                               \
-        scalar_type() == ScalarType::name,     \
-        "expected scalar type ",               \
-        #name,                                 \
-        " but found ",                         \
-        c10::toString(scalar_type()));         \
-    return static_cast<T*>(this->unsafeGetTensorImpl()->data()); \
-  }
-
-AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_CAST)
-AT_FORALL_QINT_TYPES(DEFINE_CAST)
-#undef DEFINE_CAST
-
-#define DEFINE_ITEM(T, name)      \
-  template <>                     \
-  inline T Tensor::item() const { \
-    return item().to##name();     \
-  }
-
-AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(DEFINE_ITEM)
-#undef DEFINE_ITEM
-
-} //namespace at
diff --git a/aten/src/ATen/core/context_base.cpp b/aten/src/ATen/core/context_base.cpp
deleted file mode 100644
index f91764031f84d..0000000000000
--- a/aten/src/ATen/core/context_base.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include 
-
-#include 
-
-namespace at {
-
-C10_DEFINE_TYPED_REGISTRY(
-    ContextRegistry,
-    at::DeviceType,
-    at::BaseContext,
-    std::unique_ptr,
-    at::Device);
-
-} // namespace at
-
-namespace caffe2 {
-
-// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
-
-} // namespace caffe2
diff --git a/aten/src/ATen/core/context_base.h b/aten/src/ATen/core/context_base.h
deleted file mode 100644
index 4ea513857dc78..0000000000000
--- a/aten/src/ATen/core/context_base.h
+++ /dev/null
@@ -1,164 +0,0 @@
-#pragma once
-
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-namespace caffe2 {
-class Event;
-
-} // namespace caffe2
-namespace at {
-
-class BaseContext;
-
-/**
- * Virtual interface for the Context class in Caffe2.
- *
- * A Context defines all the necessities to run an operator on a specific
- * device. Specific Context classes needs to implement all the pure virtual
- * functions in the BaseContext class.
- * TODO: add docs after this is finalized.
- */
-class CAFFE2_API BaseContext {
- public:
-  virtual ~BaseContext() noexcept {}
-
-  virtual Device device() const = 0;
-
-  /* Sorry for the naming, will get rid of this in future diff */
-  virtual DeviceType device_type() const = 0;
-
-  virtual void SwitchToDevice(int /*stream_id*/) = 0;
-
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }
-
-  virtual void WaitEvent(const caffe2::Event& ev) = 0;
-
-  virtual void Record(caffe2::Event* ev, const char* err_msg = nullptr)
-      const = 0;
-
-  virtual void FinishDeviceComputation() = 0;
-
-  // This used to be arbitrary cross-device copy, but it turns out everyone
-  // did direct CPU-X copy, so we just make three functions for it (to avoid
-  // double dispatch). This will get obsoleted by C10. where copies
-  // will be proper operators (and get to rely on multiple dispatch there.)
-  virtual void CopyBytesSameDevice(
-      size_t nbytes,
-      const void* src,
-      void* dst) = 0;
-
-  virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0;
-
-  virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0;
-
-  template <typename T>
-  inline void CopySameDevice(size_t n, const T* src, T* dst) {
-    static_assert(
-        std::is_fundamental<T>::value,
-        "CopySameDevice requires fundamental types");
-    CopyBytesSameDevice(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  template <typename T>
-  inline void CopyFromCPU(size_t n, const T* src, T* dst) {
-    static_assert(
-        std::is_fundamental<T>::value,
-        "CopyFromCPU requires fundamental types");
-    CopyBytesFromCPU(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  template <typename T>
-  inline void CopyToCPU(size_t n, const T* src, T* dst) {
-    static_assert(
-        std::is_fundamental<T>::value, "CopyToCPU requires fundamental types");
-    CopyBytesToCPU(
-        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
-  }
-
-  virtual bool SupportsNonFundamentalTypes() const {
-    return false;
-  }
-
-  inline void EnforceMetaCopyOK() {
-    AT_ASSERTM(
-        SupportsNonFundamentalTypes(), "Context requires fundamental types");
-  }
-
-  void CopyItemsSameDevice(
-      const caffe2::TypeMeta& meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesSameDevice(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  void CopyItemsFromCPU(
-      const caffe2::TypeMeta& meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesFromCPU(n * meta.itemsize(), src, dst);
-    }
-  }
-
-  void CopyItemsToCPU(
-      const caffe2::TypeMeta& meta,
-      size_t n,
-      const void* src,
-      void* dst) {
-    if (meta.copy()) {
-      EnforceMetaCopyOK();
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytesToCPU(n * meta.itemsize(), src, dst);
-    }
-  }
-};
-
-// Context constructor registry
-C10_DECLARE_TYPED_REGISTRY(
-    ContextRegistry,
-    at::DeviceType,
-    at::BaseContext,
-    std::unique_ptr,
-    at::Device);
-
-#define REGISTER_CONTEXT(type, ...) \
-  C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__)
-
-inline std::unique_ptr<at::BaseContext> CreateContext(
-    const at::Device& device) {
-  return at::ContextRegistry()->Create(device.type(), device);
-}
-
-} // namespace at
-
-namespace caffe2 {
-
-using at::BaseContext;
-using at::CreateContext;
-} // namespace caffe2
diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py
index c3ee9547133e4..b87d4292b5d41 100644
--- a/aten/src/ATen/function_wrapper.py
+++ b/aten/src/ATen/function_wrapper.py
@@ -1286,9 +1286,7 @@ def gen_namespace_function(option, multidispatch_tensors):
     return FunctionCode(definition=fn_definition, declaration=fn_declaration)
 
    # Emit #ifdef BUILD_NAMEDTENSOR macros for any code generated here
-    # that is sent to top_env. This is because some of this code (Type.h,
-    # TensorBody.h, TensorMethods.h) is checked into the repo and must be
-    # the same regardless of BUILD_NAMEDTENSOR status.
+    # that is sent to top_env.
     is_named_tensor_only = (has_named_tensor_formals(formals) or
                             option['api_name'] == 'align_tensors' or
                             option['api_name'] == 'align_as')
diff --git a/aten/src/ATen/gen.py b/aten/src/ATen/gen.py
index cff113ebb0211..f99e5828c03ae 100644
--- a/aten/src/ATen/gen.py
+++ b/aten/src/ATen/gen.py
@@ -45,12 +45,17 @@
     action='store_true',
     help='reinterpret CUDA as ROCm/HIP and adjust filepaths accordingly')
 options = parser.parse_args()
-gen_to_source = os.environ.get('GEN_TO_SOURCE')  # update source directly as part of gen
-if not gen_to_source:
-    core_install_dir = os.path.join(options.install_dir, 'core_tmp') if options.install_dir is not None else None
-else:
-    core_install_dir = os.path.join(options.source_path, 'core')
-
+# NB: It is mandatory to NOT use os.path.join here, as the install directory
+# will eventually be ingested by cmake, which does not respect Windows style
+# path slashes. If you switch this to use os.path.join, you'll get an error
+# like:
+#
+#   Syntax error in cmake code when parsing string
+#
+#     C:/Jenkins/workspace/pytorch-builds/pytorch-win-ws2016-cuda9-cudnn7-py3-build/build/aten/src/ATen\core/TensorMethods.h
+#
+#   Invalid character escape '\c'.
+core_install_dir = options.install_dir + '/core' if options.install_dir is not None else None
 if options.install_dir is not None and not os.path.exists(options.install_dir):
     os.makedirs(options.install_dir)
 if core_install_dir is not None and not os.path.exists(core_install_dir):
@@ -327,7 +332,7 @@ def iterate_types():
 # so that the script runs quickly when we are just querying the
 # outputs
 def declare_outputs():
-    core_files = ['TensorBody.h', 'TensorMethods.h']
+    core_files = ['TensorBody.h', 'TensorMethods.h', 'OpsAlreadyMovedToC10.cpp']
     for f in core_files:
         core_file_manager.will_write(f)
     files = ['Declarations.yaml', 'TypeDefault.cpp', 'TypeDefault.h',
@@ -359,36 +364,6 @@ def filter_by_extension(files, *extensions):
     return filtered_files
 
-
-# because EOL may not be LF(\n) on some environment (e.g. Windows),
-# normalize EOL from CRLF/CR to LF and compare both files.
-def cmpfiles_with_eol_normalization(a, b, names):
-    results = ([], [], [])  # match, mismatch, error
-    for x in names:
-        try:
-            with open(os.path.join(a, x)) as f:
-                ax = f.read().replace('\r\n', '\n').replace('\r', '\n')
-            with open(os.path.join(b, x)) as f:
-                bx = f.read().replace('\r\n', '\n').replace('\r', '\n')
-            if ax == bx:
-                results[0].append(x)
-            else:
-                results[1].append(x)
-                import difflib
-                import sys
-                d = difflib.Differ()
-                sys.stdout.write('-' * 80 + '\n')
-                sys.stdout.write('x={}, a={}, b={}\n'.format(x, a, b))
-                for i, line in enumerate(list(d.compare(ax.splitlines(), bx.splitlines()))):
-                    if line[:2] != '  ':
-                        sys.stdout.write('{:5d}: {}\n'.format(i, line))
-                sys.stdout.write('-' * 80 + '\n')
-                sys.stdout.write(ax)
-                sys.stdout.write('-' * 80 + '\n')
-        except OSError:
-            results[2].append(x)
-    return results
-
-
 def is_namedtensor_only_decl(decl):
     if 'Dimname' in decl['schema_string']:
         return True
@@ -448,21 +423,6 @@ def generate_outputs():
     file_manager.check_all_files_written()
     cuda_file_manager.check_all_files_written()
 
-    # check that generated files match source files
-    core_source_path = os.path.join(options.source_path, 'core')
-    match, mismatch, errors = cmpfiles_with_eol_normalization(core_install_dir, core_source_path, core_files.keys())
-    if errors:
-        raise RuntimeError("Error while trying to compare source and generated files for {}. "
-                           "Source directory: {}. Generated directory: {}."
-                           .format(errors, core_source_path, core_install_dir))
-    if mismatch:
-        file_component = '{}'.format(','.join(mismatch))
-        if len(mismatch) > 1:
-            file_component = '{' + file_component + '}'
-        update_cmd = "cp {}/{} {}".format(core_install_dir, file_component, core_source_path)
-        raise RuntimeError("Source files: {} did not match generated files. To update the source files, "
-                           "set environment variable GEN_TO_SOURCE or run \"{}\"".format(mismatch, update_cmd))
-
 declare_outputs()
 if options.output_dependencies is not None:
     file_manager.write_outputs(options.output_dependencies)
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index b2b8e9358fb54..e0c1b1222dccf 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -44,14 +44,6 @@ if (INTERN_BUILD_ATEN_OPS)
   list(APPEND Caffe2_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS})
   list(APPEND Caffe2_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS})
   list(APPEND Caffe2_DEPENDENCY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE})
-else()
-  # Only add "ATen Core", a minimal, easy-to-compile fragment of ATen.
-  # This codepath should only be exercised by the Android build.
-  add_subdirectory(../aten/src/ATen/core ATen_core)
-  list(APPEND Caffe2_CPU_SRCS ${ATen_CORE_SRCS})
-  list(APPEND Caffe2_CPU_INCLUDE ${ATen_CORE_INCLUDE})
-  list(APPEND Caffe2_CPU_TEST_SRCS ${ATen_CORE_TEST_SRCS})
-  # See cmake/Codegen.cmake for header installation
 endif()
 
 # ---[ Caffe2 build
diff --git a/caffe2/c2_aten_srcs.bzl b/caffe2/c2_aten_srcs.bzl
index 8e9dd16de21b9..53cb3553ae631 100644
--- a/caffe2/c2_aten_srcs.bzl
+++ b/caffe2/c2_aten_srcs.bzl
@@ -2,13 +2,11 @@ ATEN_CORE_HEADER_FILES = [
     # "aten/src/" prefix is added later
     "ATen/core/ATenGeneral.h",
     "ATen/core/blob.h",
-    "ATen/core/context_base.h",
     "ATen/core/DimVector.h",
     "ATen/core/grad_mode.h",
     "ATen/core/UndefinedTensorImpl.h",
 ]
 
 ATEN_CORE_SRC_FILES = [
-    "aten/src/ATen/core/context_base.cpp",
     "aten/src/ATen/core/grad_mode.cpp",
 ]
diff --git a/caffe2/core/context_base.cc b/caffe2/core/context_base.cc
index 99996d9e165b9..35a792ef86908 100644
--- a/caffe2/core/context_base.cc
+++ b/caffe2/core/context_base.cc
@@ -1,5 +1,20 @@
-#include "context_base.h"
+#include 
+
+#include 
+
+namespace at {
+
+C10_DEFINE_TYPED_REGISTRY(
+    ContextRegistry,
+    at::DeviceType,
+    at::BaseContext,
+    std::unique_ptr,
+    at::Device);
+
+} // namespace at
 
 namespace caffe2 {
 
+// TODO: rename context.h -> context_cpu.h & context_base.h -> context.h
+
 } // namespace caffe2
diff --git a/caffe2/core/context_base.h b/caffe2/core/context_base.h
index 3a6dfad5b95cc..3ba0d522b5f84 100644
--- a/caffe2/core/context_base.h
+++ b/caffe2/core/context_base.h
@@ -1,7 +1,168 @@
 #pragma once
 
-#include 
-// For CaffeMap
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
 #include "caffe2/core/common.h"
 #include "caffe2/core/logging.h"
 #include "caffe2/proto/caffe2_pb.h"
+
+namespace caffe2 {
+class Event;
+
+} // namespace caffe2
+namespace at {
+
+class BaseContext;
+
+/**
+ * Virtual interface for the Context class in Caffe2.
+ *
+ * A Context defines all the necessities to run an operator on a specific
+ * device. Specific Context classes needs to implement all the pure virtual
+ * functions in the BaseContext class.
+ * TODO: add docs after this is finalized.
+ */
+class CAFFE2_API BaseContext {
+ public:
+  virtual ~BaseContext() noexcept {}
+
+  virtual Device device() const = 0;
+
+  /* Sorry for the naming, will get rid of this in future diff */
+  virtual DeviceType device_type() const = 0;
+
+  virtual void SwitchToDevice(int /*stream_id*/) = 0;
+
+  inline void SwitchToDevice() {
+    SwitchToDevice(0);
+  }
+
+  virtual void WaitEvent(const caffe2::Event& ev) = 0;
+
+  virtual void Record(caffe2::Event* ev, const char* err_msg = nullptr)
+      const = 0;
+
+  virtual void FinishDeviceComputation() = 0;
+
+  // This used to be arbitrary cross-device copy, but it turns out everyone
+  // did direct CPU-X copy, so we just make three functions for it (to avoid
+  // double dispatch). This will get obsoleted by C10. where copies
+  // will be proper operators (and get to rely on multiple dispatch there.)
+ virtual void CopyBytesSameDevice( + size_t nbytes, + const void* src, + void* dst) = 0; + + virtual void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) = 0; + + virtual void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) = 0; + + template + inline void CopySameDevice(size_t n, const T* src, T* dst) { + static_assert( + std::is_fundamental::value, + "CopySameDevice requires fundamental types"); + CopyBytesSameDevice( + n * sizeof(T), static_cast(src), static_cast(dst)); + } + + template + inline void CopyFromCPU(size_t n, const T* src, T* dst) { + static_assert( + std::is_fundamental::value, + "CopyFromCPU requires fundamental types"); + CopyBytesFromCPU( + n * sizeof(T), static_cast(src), static_cast(dst)); + } + + template + inline void CopyToCPU(size_t n, const T* src, T* dst) { + static_assert( + std::is_fundamental::value, "CopyToCPU requires fundamental types"); + CopyBytesToCPU( + n * sizeof(T), static_cast(src), static_cast(dst)); + } + + virtual bool SupportsNonFundamentalTypes() const { + return false; + } + + inline void EnforceMetaCopyOK() { + AT_ASSERTM( + SupportsNonFundamentalTypes(), "Context requires fundamental types"); + } + + void CopyItemsSameDevice( + const caffe2::TypeMeta& meta, + size_t n, + const void* src, + void* dst) { + if (meta.copy()) { + EnforceMetaCopyOK(); + meta.copy()(src, dst, n); + } else { + CopyBytesSameDevice(n * meta.itemsize(), src, dst); + } + } + + void CopyItemsFromCPU( + const caffe2::TypeMeta& meta, + size_t n, + const void* src, + void* dst) { + if (meta.copy()) { + EnforceMetaCopyOK(); + meta.copy()(src, dst, n); + } else { + CopyBytesFromCPU(n * meta.itemsize(), src, dst); + } + } + + void CopyItemsToCPU( + const caffe2::TypeMeta& meta, + size_t n, + const void* src, + void* dst) { + if (meta.copy()) { + EnforceMetaCopyOK(); + meta.copy()(src, dst, n); + } else { + CopyBytesToCPU(n * meta.itemsize(), src, dst); + } + } +}; + +// Context constructor registry +C10_DECLARE_TYPED_REGISTRY( + ContextRegistry, + at::DeviceType, + at::BaseContext, + std::unique_ptr, + at::Device); + +#define REGISTER_CONTEXT(type, ...) 
\ + C10_REGISTER_TYPED_CLASS(ContextRegistry, type, __VA_ARGS__) + +inline std::unique_ptr CreateContext( + const at::Device& device) { + return at::ContextRegistry()->Create(device.type(), device); +} + +} // namespace at + +namespace caffe2 { + +using at::BaseContext; +using at::CreateContext; +} // namespace caffe2 diff --git a/caffe2/core/export_c10_op_to_caffe2.h b/caffe2/core/export_c10_op_to_caffe2.h index b9935a77a3d6b..80c1ff8a7b07d 100644 --- a/caffe2/core/export_c10_op_to_caffe2.h +++ b/caffe2/core/export_c10_op_to_caffe2.h @@ -1,13 +1,16 @@ #pragma once +#include +#include +#include "caffe2/core/operator.h" + // TODO Also register c10 operators on mobile -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) #include #include #include #include #include -#include "caffe2/core/operator.h" #include "caffe2/core/export_caffe2_op_to_c10.h" namespace caffe2 { diff --git a/caffe2/core/export_caffe2_op_to_c10.h b/caffe2/core/export_caffe2_op_to_c10.h index 84a5f70fb4ef2..11db8bb51218f 100644 --- a/caffe2/core/export_caffe2_op_to_c10.h +++ b/caffe2/core/export_caffe2_op_to_c10.h @@ -1,6 +1,8 @@ #pragma once -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#include + +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) #include #include #include @@ -93,7 +95,7 @@ void call_caffe2_op_from_c10( } inline FunctionSchema make_function_schema_for_c10(const char* schema_str) { -#if defined(CAFFE2_IS_XPLAT_BUILD) +#if defined(CAFFE2_IS_XPLAT_BUILD) || defined(C10_MOBILE) throw std::logic_error("We don't support registering c10 ops on mobile yet because the function schema parser isn't present in the mobile build."); #else c10::FunctionSchema parsed_schema = torch::jit::parseSchema(schema_str); diff --git a/caffe2/core/net_async_base.h b/caffe2/core/net_async_base.h index 01eca67ddc77e..a2304d89fada2 100644 --- a/caffe2/core/net_async_base.h +++ b/caffe2/core/net_async_base.h @@ -1,6 +1,7 @@ #ifndef CAFFE2_CORE_NET_ASYNC_BASE_H_ #define CAFFE2_CORE_NET_ASYNC_BASE_H_ +#include #include "c10/core/thread_pool.h" #include "c10/util/Registry.h" #include "caffe2/core/common.h" @@ -13,7 +14,6 @@ #include "caffe2/proto/caffe2_pb.h" #include "caffe2/proto/prof_dag.pb.h" #include "caffe2/utils/proto_utils.h" -#include C10_DECLARE_int(caffe2_streams_per_gpu); C10_DECLARE_int(caffe2_net_async_max_gpus); diff --git a/caffe2/core/operator.cc b/caffe2/core/operator.cc index 225b01d1034d9..cb98352a938da 100644 --- a/caffe2/core/operator.cc +++ b/caffe2/core/operator.cc @@ -14,7 +14,7 @@ #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/proto_utils.h" #include "caffe2/utils/string_utils.h" -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) #include #endif @@ -58,7 +58,7 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws) device_option_( operator_def.has_device_option() ? operator_def.device_option() : DeviceOption()), -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) newstyle_outputs_(), #endif input_size_(operator_def.input_size()), @@ -86,7 +86,7 @@ OperatorBase::OperatorBase(const OperatorDef& operator_def, Workspace* ws) type_ = operator_def.type(); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) namespace { int C10_UNUSED // Suppress unused function warning on mobile. 
@@ -793,7 +793,7 @@ std::function GetOperatorLogger() { c10::optional OperatorBase::argumentIndexWithName( const std::string& name) const { -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) return getFunctionSchema().argumentIndexWithName(name); #else CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); diff --git a/caffe2/core/operator.h b/caffe2/core/operator.h index 4296209f512c8..a60149c40d9a7 100644 --- a/caffe2/core/operator.h +++ b/caffe2/core/operator.h @@ -28,7 +28,7 @@ #include "caffe2/proto/caffe2_pb.h" #include "caffe2/utils/proto_utils.h" -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) #include #include #endif @@ -59,7 +59,7 @@ class CAFFE2_API OperatorBase : public Observable { * Alternatively, inputs can be one tensor list ivalue followed by non-tensors * to represent operators with a variable number of inputs. */ -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) explicit OperatorBase( const c10::FunctionSchema& schema, std::vector inputs, @@ -72,7 +72,7 @@ class CAFFE2_API OperatorBase : public Observable { * New operators should be instantiated with FunctionSchema */ bool isLegacyOperator() const { -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) return !fn_schema_; #else return true; @@ -81,7 +81,7 @@ class CAFFE2_API OperatorBase : public Observable { const c10::FunctionSchema& getFunctionSchema() const { CAFFE_ENFORCE(!isLegacyOperator()); -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) return *fn_schema_.get(); #else CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); @@ -107,7 +107,7 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::GetSingleArgument( *operator_def_, name, default_value); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; @@ -123,7 +123,7 @@ class CAFFE2_API OperatorBase : public Observable { return ArgumentHelper::HasSingleArgumentOfType( *operator_def_, name); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) template inline vector GetVectorFromIValueList(const c10::IValue& value) const { return c10::impl::toVector(value.template to>()); @@ -180,7 +180,7 @@ class CAFFE2_API OperatorBase : public Observable { throw enf; } } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) DCHECK_LT(0, newstyle_inputs_.size()); IValue ival; if (newstyle_inputs_[0].isTensorList()) { @@ -230,7 +230,7 @@ class CAFFE2_API OperatorBase : public Observable { // When you get a Tensor here it is not fully initialized return BlobGetMutableTensor(outputs_.at(idx), type); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) at::Tensor output = newstyle_outputs_[idx]; Tensor tensor = caffe2::Tensor(output); if (!tensor.defined() || tensor.GetDeviceType() != type) { @@ -260,7 +260,7 @@ class CAFFE2_API OperatorBase : public Observable { void SetOutputTensor(int idx, Tensor tensor) { if (!isLegacyOperator()) { -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) 
newstyle_outputs_[idx] = at::Tensor(tensor); // also update the tensor in the hack @@ -289,7 +289,7 @@ class CAFFE2_API OperatorBase : public Observable { "device must be provided in options."); return BlobGetMutableTensor(outputs_.at(idx), dims, options); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) at::Tensor output = newstyle_outputs_[idx]; Tensor tensor = GetSizedTensorWithOptions(caffe2::Tensor(output), dims, options); @@ -413,7 +413,7 @@ class CAFFE2_API OperatorBase : public Observable { if (isLegacyOperator()) { return outputs_.size(); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) return newstyle_outputs_.size(); #else CAFFE_THROW("Non-legacy operators are not legal in xplat/caffe2"); @@ -628,7 +628,7 @@ class CAFFE2_API OperatorBase : public Observable { return helper_; } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) c10::List move_newstyle_outputs() && { return std::move(newstyle_outputs_); } @@ -646,7 +646,7 @@ class CAFFE2_API OperatorBase : public Observable { vector inputs_; vector outputs_; // Preferrably use c10::optional, but nvcc doesn't work -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) std::unique_ptr fn_schema_; vector newstyle_inputs_; c10::List newstyle_outputs_; @@ -710,7 +710,7 @@ inline NetDef OperatorBase::GetSingleArgument( return NetDef(); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) template <> inline vector OperatorBase::GetVectorFromIValueList( const c10::IValue& value) const { @@ -794,7 +794,7 @@ inline vector OperatorBase::GetRepeatedArgument( return ArgumentHelper::GetRepeatedArgument( *operator_def_, name, default_value); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; @@ -815,7 +815,7 @@ inline vector OperatorBase::GetRepeatedArgument( return ArgumentHelper::GetRepeatedArgument( *operator_def_, name, default_value); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) auto index = argumentIndexWithName(name); CAFFE_ENFORCE(index.has_value(), "Couldn't get index for argument!", name); const auto& value = newstyle_inputs_[index.value()]; @@ -843,7 +843,7 @@ class Operator : public OperatorBase { // constructors will run on that device. 
context_.SwitchToDevice(); } -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) explicit Operator( const c10::FunctionSchema& fn_schema, std::vector inputs, diff --git a/caffe2/core/tensor.cc b/caffe2/core/tensor.cc index afdc44424ee31..9fe81027ad93e 100644 --- a/caffe2/core/tensor.cc +++ b/caffe2/core/tensor.cc @@ -202,9 +202,11 @@ void Tensor::enforce_invariants() { throw std::runtime_error("TensorImpl with nullptr is not supported"); } // TODO: only check `!impl_->requires_grad()` after Variable and Tensor are merged +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) CAFFE_ENFORCE( !impl_->is_variable() || !(impl_->requires_grad() && at::GradMode::is_enabled()), "Caffe2 tensor wrapper doesn't support autograd variables that require grad"); +#endif CAFFE_ENFORCE_EQ( impl_->layout(), at::kStrided, diff --git a/caffe2/core/tensor.h b/caffe2/core/tensor.h index 41b2b421c4051..1f1ab8d06e56d 100644 --- a/caffe2/core/tensor.h +++ b/caffe2/core/tensor.h @@ -1,18 +1,18 @@ #ifndef CAFFE2_CORE_TENSOR_H_ #define CAFFE2_CORE_TENSOR_H_ +#include #include "caffe2/core/storage.h" #include "caffe2/core/tensor_impl.h" #include #include -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) #include "ATen/core/Tensor.h" +#include #endif #include -#include - namespace caffe2 { using at::UndefinedTensorImpl; @@ -119,7 +119,7 @@ class CAFFE2_API Tensor final { * The tensor will share the same instance (data, strides, sizes, etc) but * a different subset of APIs would be available */ -#if !defined(CAFFE2_IS_XPLAT_BUILD) +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) explicit Tensor(at::Tensor tensor) : impl_(std::move(tensor.impl_)) { enforce_invariants(); @@ -196,7 +196,9 @@ class CAFFE2_API Tensor final { */ void CopyFrom(const Tensor& src, bool async = false) { // TODO: only check `!impl_->requires_grad()` after Variable and Tensor are merged +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) AT_ASSERT(!impl_->is_variable() || !(impl_->requires_grad() && at::GradMode::is_enabled())); +#endif AT_ASSERTM( src.impl_->is_contiguous(), "Right now only copy of contiguous source Tensor is supported."); diff --git a/caffe2/core/tensor_impl.h b/caffe2/core/tensor_impl.h index 64f6db326e315..6c5316da83d89 100644 --- a/caffe2/core/tensor_impl.h +++ b/caffe2/core/tensor_impl.h @@ -2,8 +2,8 @@ #include #include -#include -#include +#include +#include namespace caffe2 { using at::ToVectorint64_t; diff --git a/caffe2/operators/CMakeLists.txt b/caffe2/operators/CMakeLists.txt index ecdb951fcdaf7..0b3284f42fd0f 100644 --- a/caffe2/operators/CMakeLists.txt +++ b/caffe2/operators/CMakeLists.txt @@ -42,23 +42,25 @@ exclude(tmp "${tmp}" ${tmp_cudnn}) set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) # Add all files in experimental -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/flatten_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/averaged_loss_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/mul_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/relu_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/expand_dims_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/filler_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} 
${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/sparse_lengths_sum_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/sigmoid_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/cast_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/stop_gradient_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/batch_gather_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/concat_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/batch_matmul_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/fc_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/enforce_finite_cpu.cc) -set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/add_cpu.cc) +if (NOT INTERN_BUILD_MOBILE AND NOT BUILD_CAFFE2_MOBILE) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/flatten_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/averaged_loss_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/mul_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/relu_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/expand_dims_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/filler_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/sparse_lengths_sum_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/sigmoid_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/cast_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/stop_gradient_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/batch_gather_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/concat_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/batch_matmul_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/sigmoid_cross_entropy_with_logits_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/fc_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/enforce_finite_cpu.cc) + set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${CMAKE_CURRENT_LIST_DIR}/experimental/c10/cpu/add_cpu.cc) +endif() set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp}) # exclude test files and gpu files diff --git a/caffe2/operators/reduction_ops.h b/caffe2/operators/reduction_ops.h index b6ca336f2a56f..d4fbef0359b45 100644 --- a/caffe2/operators/reduction_ops.h +++ b/caffe2/operators/reduction_ops.h @@ -19,11 +19,13 @@ class SumElementsOp : public Operator { average_(this->template GetSingleArgument("average", false)) {} explicit SumElementsOp(const OperatorDef& operator_def, Workspace* ws, bool average) : Operator(operator_def, ws), 
average_(average) {} +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) explicit SumElementsOp(const c10::FunctionSchema& schema, std::vector inputs, std::vector outputs) : Operator(schema, std::move(inputs), std::move(outputs)), average_(this->template GetSingleArgument("average", false)) {} explicit SumElementsOp(const c10::FunctionSchema& schema, std::vector inputs, std::vector outputs, bool average) : Operator(schema, std::move(inputs), std::move(outputs)), average_(average) {} +#endif ~SumElementsOp() {} bool RunOnDevice() override { @@ -85,11 +87,13 @@ class SumElementsGradientOp : public Operator { average_(this->template GetSingleArgument("average", false)) {} explicit SumElementsGradientOp(const OperatorDef& operator_def, Workspace* ws, bool average) : Operator(operator_def, ws), average_(average) {} +#if !defined(CAFFE2_IS_XPLAT_BUILD) && !defined(C10_MOBILE) explicit SumElementsGradientOp(const c10::FunctionSchema& schema, std::vector inputs, std::vector outputs) : Operator(schema, std::move(inputs), std::move(outputs)), average_(this->template GetSingleArgument("average", false)) {} explicit SumElementsGradientOp(const c10::FunctionSchema& schema, std::vector inputs, std::vector outputs, bool average) : Operator(schema, std::move(inputs), std::move(outputs)), average_(average) {} +#endif ~SumElementsGradientOp() {} bool RunOnDevice() override; diff --git a/cmake/Codegen.cmake b/cmake/Codegen.cmake index 387304cf74679..f8fdcc8be71e6 100644 --- a/cmake/Codegen.cmake +++ b/cmake/Codegen.cmake @@ -171,29 +171,24 @@ if (INTERN_BUILD_ATEN_OPS) message(STATUS ${generated_cpp}) message(FATAL_ERROR "Failed to get generated_cpp list") endif() + # FIXME: the file/variable name lists cpp, but these list both cpp and .h files file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt generated_cpp) file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-cuda cuda_generated_cpp) + file(READ ${CMAKE_BINARY_DIR}/aten/src/ATen/generated_cpp.txt-core core_generated_cpp) file(GLOB_RECURSE all_templates "${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/templates/*") - # these are files that are generated by the script and checked in -- the script checks - # that they are equivalent so it must be a dependency of the script - set(core_gen_checked_inputs - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorBody.h - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/TensorMethods.h - ${CMAKE_CURRENT_LIST_DIR}/../aten/src/ATen/core/OpsAlreadyMovedToC10.cpp) - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen) - file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core_tmp) + file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/aten/src/ATen/core) - add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} + add_custom_command(OUTPUT ${generated_cpp} ${cuda_generated_cpp} ${core_generated_cpp} COMMAND ${GEN_COMMAND} - DEPENDS ${all_python} ${all_templates} ${cwrap_files} ${core_gen_checked_inputs}) + DEPENDS ${all_python} ${all_templates} ${cwrap_files}) # Generated headers used from a CUDA (.cu) file are # not tracked correctly in CMake. We make the libATen.so depend explicitly # on building the generated ATen files to workaround. 
- add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS ${generated_cpp}) + add_custom_target(ATEN_CPU_FILES_GEN_TARGET DEPENDS ${generated_cpp} ${core_generated_cpp}) add_custom_target(ATEN_CUDA_FILES_GEN_TARGET DEPENDS ${cuda_generated_cpp}) add_library(ATEN_CPU_FILES_GEN_LIB INTERFACE) add_library(ATEN_CUDA_FILES_GEN_LIB INTERFACE) diff --git a/docs/cpp/source/Doxyfile b/docs/cpp/source/Doxyfile index 8290c9c71d1b1..19661452f0624 100644 --- a/docs/cpp/source/Doxyfile +++ b/docs/cpp/source/Doxyfile @@ -33,7 +33,6 @@ INPUT = ../../../aten/src/ATen/ATen.h \ ../../../aten/src/ATen/Backend.h \ ../../../aten/src/ATen/core/ivalue.h \ ../../../aten/src/ATen/core/ScalarType.h \ - ../../../aten/src/ATen/core/Tensor.h \ ../../../aten/src/ATen/cuda/CUDAContext.h \ ../../../aten/src/ATen/cudnn/Descriptors.h \ ../../../aten/src/ATen/cudnn/Handles.h \ @@ -44,6 +43,7 @@ INPUT = ../../../aten/src/ATen/ATen.h \ ../../../aten/src/ATen/mkl/Descriptors.h \ ../../../aten/src/ATen/Scalar.h \ ../../../aten/src/ATen/TensorOptions.h \ + ../../../aten/src/ATen/core/Tensor.h \ ../../../build/aten/src/ATen/Functions.h \ ../../../c10/core/Device.h \ ../../../c10/core/DeviceType.h \ From 0222eceaaafc94859c86de30e65856863603db23 Mon Sep 17 00:00:00 2001 From: vishwakftw Date: Sun, 6 Oct 2019 23:26:55 -0700 Subject: [PATCH 090/113] Remove outdated note in cholesky_solve and triangular_solve doc strings (#26989) Summary: We do support inputs with dim > 2 in _out variants Pull Request resolved: https://github.com/pytorch/pytorch/pull/26989 Differential Revision: D17785632 Pulled By: soumith fbshipit-source-id: d42ba7ca9c225ad1a26ff3b410d0c5c08eaed001 --- torch/_torch_docs.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index edf90bf377054..6cf93b949c8c0 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -1055,11 +1055,6 @@ def merge_dicts(*dicts): batches of 2D matrices. If the inputs are batches, then returns batched outputs `c` -.. note:: - - The :attr:`out` keyword only supports 2D matrix inputs, that is, - `b, u` must be 2D matrices. - Args: input (Tensor): input matrix :math:`b` of size :math:`(*, m, k)`, where :math:`*` is zero or more batch dimensions @@ -5486,11 +5481,6 @@ def merge_dicts(*dicts): batches of 2D matrices. If the inputs are batches, then returns batched outputs `X` -.. note:: - - The :attr:`out` keyword only supports 2D matrix inputs, that is, - `b, A` must be 2D matrices. - Args: input (Tensor): multiple right-hand sides of size :math:`(*, m, k)` where :math:`*` is zero of more batch dimensions (:math:`b`) From 1250acef90a3387febb2d70ec643038270c6ef84 Mon Sep 17 00:00:00 2001 From: Pritam Damania Date: Mon, 7 Oct 2019 11:26:43 -0700 Subject: [PATCH 091/113] Disable tsan for test_multiprocessing. (#27410) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27410 Similar to https://github.com/pytorch/pytorch/pull/25005, TSAN is not safe to use in a multi-threaded program with fork and can cause deadlocks. As a result, disabling this test for TSAN. 
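For reference, this reuses the sanitizer-skip pattern already in place for ASAN. A minimal sketch of the resulting guard, assuming only names that appear in the diff below (the test bodies themselves are unchanged and elided here):

    import unittest
    from common_utils import TestCase, TEST_WITH_TSAN

    @unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a "
                     "multi-threaded environment")
    class TestMultiprocessing(TestCase):
        ...  # all tests in the class are skipped under TSAN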
ghstack-source-id: 91393545 Test Plan: buildbot Differential Revision: D17775141 fbshipit-source-id: 109b8095240ad43ee4a6380f70b9efca863c0a4a --- test/test_multiprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index b7ab2b89bdfd4..3911f601bcac4 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -13,7 +13,7 @@ import torch.utils.hooks from torch.nn import Parameter from common_utils import (TestCase, run_tests, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, TEST_WITH_ASAN, - load_tests, slowTest) + load_tests, slowTest, TEST_WITH_TSAN) # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -221,6 +221,7 @@ def _has_shm_files(self): return False +@unittest.skipIf(TEST_WITH_TSAN, "TSAN is not fork-safe since we're forking in a multi-threaded environment") class TestMultiprocessing(TestCase): def tearDown(self): From a24291a5549571d8a53c9f7572984ec18540025c Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Mon, 7 Oct 2019 12:49:45 -0700 Subject: [PATCH 092/113] Unfold export (#24970) Summary: ONNX export for Unfold in symbolic opset9 + op and ORT tests Pull Request resolved: https://github.com/pytorch/pytorch/pull/24970 Reviewed By: hl475 Differential Revision: D17495106 Pulled By: houseroad fbshipit-source-id: fcd179a1213c0f219628f25c09e66fcfe4c5df50 --- .../expect/TestOperators.test_unfold.expect | 121 ++++++++++++++++++ test/onnx/test_operators.py | 4 + test/onnx/test_pytorch_onnx_onnxruntime.py | 8 ++ torch/onnx/symbolic_opset9.py | 16 ++- 4 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 test/onnx/expect/TestOperators.test_unfold.expect diff --git a/test/onnx/expect/TestOperators.test_unfold.expect b/test/onnx/expect/TestOperators.test_unfold.expect new file mode 100644 index 0000000000000..653ca7accc654 --- /dev/null +++ b/test/onnx/expect/TestOperators.test_unfold.expect @@ -0,0 +1,121 @@ +ir_version: 4 +producer_name: "pytorch" +producer_version: "1.3" +graph { + node { + input: "0" + output: "1" + op_type: "Slice" + attribute { + name: "axes" + ints: 2 + type: INTS + } + attribute { + name: "ends" + ints: 2 + type: INTS + } + attribute { + name: "starts" + ints: 0 + type: INTS + } + } + node { + input: "0" + output: "2" + op_type: "Slice" + attribute { + name: "axes" + ints: 2 + type: INTS + } + attribute { + name: "ends" + ints: 4 + type: INTS + } + attribute { + name: "starts" + ints: 2 + type: INTS + } + } + node { + input: "1" + output: "3" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 2 + type: INTS + } + } + node { + input: "2" + output: "4" + op_type: "Unsqueeze" + attribute { + name: "axes" + ints: 2 + type: INTS + } + } + node { + input: "3" + input: "4" + output: "5" + op_type: "Concat" + attribute { + name: "axis" + i: 2 + type: INT + } + } + name: "torch-jit-export" + input { + name: "0" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 4 + } + } + } + } + } + output { + name: "5" + type { + tensor_type { + elem_type: 1 + shape { + dim { + dim_value: 2 + } + dim { + dim_value: 3 + } + dim { + dim_value: 2 + } + dim { + dim_value: 2 + } + } + } + } + } +} +opset_import { + version: 9 +} diff --git a/test/onnx/test_operators.py b/test/onnx/test_operators.py index 365ba55a4af7e..83c798f78101b 100644 --- a/test/onnx/test_operators.py +++ b/test/onnx/test_operators.py @@ -771,6 
+771,10 @@ def test_frobenius_norm(self): x = torch.randn(2, 3, 4).float() self.assertONNX(lambda x: torch.norm(x, p="fro", dim=(0, 1), keepdim=True), x) + def test_unfold(self): + x = torch.randn(2, 3, 4, requires_grad=True) + self.assertONNX(lambda x: x.unfold(dimension=2, size=2, step=2), x) + def test_remainder(self): x = torch.randn(2, 3, 4) y = torch.randn(2, 1, 4) diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 307d942b5b971..3c741fb8845d9 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -1114,6 +1114,14 @@ def forward(self, x): x = torch.randn(4, 2, 3, requires_grad=True) self.run_test(NormModel(), x) + def test_unfold(self): + class UnfoldModel(torch.nn.Module): + def forward(self, x): + return x.unfold(dimension=2, size=2, step=2) + + x = torch.randn(4, 2, 3, requires_grad=True) + self.run_test(UnfoldModel(), x) + def test_remainder(self): class RemainderModel(torch.nn.Module): def forward(self, input, other): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index b28f611c63730..19dd546ecc832 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -937,7 +937,21 @@ def instance_norm(g, input, weight, bias, running_mean, running_var, use_input_s @parse_args('v', 'i', 'i', 'i') def unfold(g, input, dimension, size, step): - return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) + if sym_help._operator_export_type == torch.onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK: + return g.op("ATen", input, operator_s="unfold", dimension_i=dimension, size_i=size, step_i=step) + if input.isCompleteTensor(): + sizedim = input.type().sizes()[dimension] + low_indices = range(0, sizedim, step) + hi_indices = range(size, sizedim + 1, step) + stack = [sym_help._slice_helper(g, input, axes=[dimension], starts=[low], ends=[hi]) + for low, hi in zip(low_indices, hi_indices)] + ndim = input.type().dim() + perm = list(range(0, ndim)) + perm.append(perm.pop(dimension)) + unsqueeze = [g.op("Unsqueeze", g.op("Transpose", t, perm_i=perm), axes_i=[dimension]) for t in stack] + return g.op("Concat", *unsqueeze, axis_i=dimension) + else: + return _unimplemented("Unfold", "input size not accessible") @parse_args('v', 'v', 'i') From 0046092178c67422d9e34b55d76c8545116fe19f Mon Sep 17 00:00:00 2001 From: davidriazati Date: Mon, 7 Oct 2019 13:50:23 -0700 Subject: [PATCH 093/113] Reduce special casing around 'training' (#27109) Summary: Most of this was old cruft left over from special handling of `training` before we had a `bool` type. This makes all modules have a `training` attribute that is true by default and removes all other special handling. 
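With this change, `training` is an ordinary `bool` attribute that is serialized like any other state. A minimal sketch of the `__getstate__`/`__setstate__` contract this standardizes on, mirroring the pattern the diffs below apply to the quantized and mkldnn modules (`weight` is an illustrative field, not part of this PR):

    class MyModule(torch.jit.ScriptModule):
        @torch.jit.script_method
        def __getstate__(self):
            # 'training' travels with the rest of the state tuple
            return (self.weight, self.training)

        @torch.jit.script_method
        def __setstate__(self, state):
            # type: (Tuple[Tensor, bool]) -> None
            self.weight = state[0]
            self.training = state[1]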
Fixes #26884 ](https://our.intern.facebook.com/intern/diff/17728129/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/27109 Pulled By: driazati Differential Revision: D17728129 fbshipit-source-id: 8ddc9fbb07a953dd05529538bfdd01ed88b5cb57 --- test/test_jit.py | 69 ++++++++++++++++--- torch/csrc/jit/pickler.cpp | 2 +- torch/csrc/jit/script/init.cpp | 10 +++ torch/csrc/jit/script/module.cpp | 2 +- torch/csrc/jit/script/module.h | 1 + .../csrc/jit/script/python_sugared_value.cpp | 23 ++----- torch/jit/__init__.py | 18 +++-- torch/jit/_recursive.py | 4 -- torch/nn/modules/module.py | 8 +-- torch/nn/quantized/modules/conv.py | 6 +- torch/nn/quantized/modules/linear.py | 6 +- torch/quantization/_quantize_script.py | 6 +- torch/utils/mkldnn.py | 15 ++-- 13 files changed, 112 insertions(+), 58 deletions(-) diff --git a/test/test_jit.py b/test/test_jit.py index 6da43694aca11..a1a09fbb24322 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -3443,6 +3443,46 @@ def forward(self, x): mod = torch.jit.script(MyMod()) FileCheck().check_dag("NamedTuple").check_dag("Exception").run(mod.forward.graph) + def test_eval_python(self): + def _test(m): + self.assertTrue(m(torch.ones(2, 2))) + self.assertTrue(m.training) + self.assertTrue(m._c._get_attribute('training')) + + m.eval() + + self.assertFalse(m.training) + self.assertFalse(m._c._get_attribute('training')) + self.assertFalse(m(torch.ones(2, 2))) + + if not PY2: + buffer = io.BytesIO() + torch.jit.save(m, buffer) + buffer.seek(0) + + loaded = torch.jit.load(buffer) + + self.assertFalse(loaded.training) + self.assertFalse(loaded._c._get_attribute('training')) + + class M(nn.Module): + def __init__(self): + super(M, self).__init__() + + def forward(self, x): + return self.training + + class OldM(torch.jit.ScriptModule): + def __init__(self): + super(OldM, self).__init__() + + @torch.jit.script_method + def forward(self, x): + return self.training + + _test(torch.jit.script(M())) + _test(OldM()) + def test_inherit_method(self): class A(torch.jit.ScriptModule): def __init__(self): @@ -3955,7 +3995,7 @@ def forward(self, xyz): bytesio = io.BytesIO(buffer) scripted = torch.jit.load(bytesio) - fc = FileCheck().check(':6:11') + fc = FileCheck().check(':7:11') fc.run(scripted.graph) fc.run(str(scripted.graph)) @@ -8316,11 +8356,12 @@ def __init__(self): @torch.jit.export def __getstate__(self): - return (3,) + return (3, self.training) @torch.jit.export def __setstate__(self, state): self.a = state[0] + self.training = state[1] def forward(self, x): return x + self.a @@ -8346,11 +8387,12 @@ def __init__(self): @torch.jit.export def __getstate__(self): - return (3,) + return (3, self.training) @torch.jit.export def __setstate__(self, state): self.a = state[0] + self.training = state[1] def forward(self, x): return x + self.a @@ -13476,12 +13518,13 @@ def __init__(self, number): @torch.jit.script_method def __getstate__(self): - return (self.buffer1, self.buffer2, 74) + return (self.buffer1, self.buffer2, 74, self.training) @torch.jit.script_method def __setstate__(self, state): self.buffer1 = state[0] + 10 self.buffer2 = state[1] + 10 + self.training = state[3] class M(torch.jit.ScriptModule): @@ -13496,13 +13539,14 @@ def __init__(self, number, submodule): @torch.jit.script_method def __getstate__(self): - return (self.buffer1, self.buffer2, 74, self.submodule) + return (self.buffer1, self.buffer2, 74, self.submodule, self.training) @torch.jit.script_method def __setstate__(self, state): self.buffer1 = state[0] + 10 self.buffer2 = state[1] + 10 
self.submodule = state[3] + self.training = state[4] with TemporaryFileName() as fname: m = M(23, submodule=Root(99)) @@ -13533,12 +13577,13 @@ def forward(self): @torch.jit.export def __getstate__(self): - return 5 + return (5, self.training) @torch.jit.export def __setstate__(self, state): - self.buffer1 = torch.ones(2, 2) + state + self.buffer1 = torch.ones(2, 2) + state[0] self.buffer2 = torch.ones(2, 2) + 10 + self.training = state[1] with TemporaryFileName() as fname: m = torch.jit.script(NoArgState()) @@ -15164,12 +15209,13 @@ def __init__(self): @torch.jit.export def __getstate__(self): - return (self.tensor,) + return (self.tensor, self.training) @torch.jit.export def __setstate__(self, state): - # type: (Tuple[Tensor]) + # type: (Tuple[Tensor, bool]) self.tensor = state[0] + self.training = state[1] def forward(self, x): return x + self.tensor @@ -15249,7 +15295,7 @@ def __init__(self, in_features, out_features): @torch.jit.export def __getstate__(self): - return torch.ops.quantized.linear_unpack(self._packed_weight)[0] + return (torch.ops.quantized.linear_unpack(self._packed_weight)[0], self.training) def forward(self): return self._packed_weight @@ -15257,7 +15303,8 @@ def forward(self): @torch.jit.export def __setstate__(self, state): self._packed_weight.set_( - torch.ops.quantized.linear_prepack(state)) + torch.ops.quantized.linear_prepack(state[0])) + self.training = state[1] @property def weight(self): diff --git a/torch/csrc/jit/pickler.cpp b/torch/csrc/jit/pickler.cpp index e70e8f14ec25a..0559e905f9d8f 100644 --- a/torch/csrc/jit/pickler.cpp +++ b/torch/csrc/jit/pickler.cpp @@ -582,7 +582,7 @@ bool checkHasValidSetGetState(const std::shared_ptr& cls) { auto set_type = set_schema.arguments().at(1).type(); TORCH_CHECK( - set_type->isSubtypeOf(get_type), + get_type->isSubtypeOf(set_type), "'__getstate__'s return type (", get_type->python_str(), ") does not match '__setstate__'s argument type (", diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 6c22edbb4adf8..cc90dde4404c6 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -709,6 +709,14 @@ void initJitScriptBindings(PyObject* module) { const std::string& filename, const ExtraFilesMap& _extra_files = ExtraFilesMap()) { Module module("__torch__.PlaceholderModule"); + // [issue 27343] + // Modules have 'training' attributes by defualt, but due to + // https://github.com/pytorch/pytorch/issues/27343, functions end + // up having a training attribute when they are loaded. This adds + // a fake 'training' attribute that shouldn't be used, but prevents + // jitter on saving and loading. Once that issue is fixed this can + // be deleted. 
+ module.register_attribute("training", BoolType::get(), true); addFunctionToModule(module, self); module.save(filename, _extra_files); }, @@ -720,6 +728,8 @@ void initJitScriptBindings(PyObject* module) { const ExtraFilesMap& _extra_files = ExtraFilesMap()) { std::ostringstream buf; Module module("__torch__.PlaceholderModule"); + // see [issue 27343] + module.register_attribute("training", BoolType::get(), true); addFunctionToModule(module, self); module.save(buf, _extra_files); return py::bytes(buf.str()); diff --git a/torch/csrc/jit/script/module.cpp b/torch/csrc/jit/script/module.cpp index 81544f5c89df5..e572629ca5542 100644 --- a/torch/csrc/jit/script/module.cpp +++ b/torch/csrc/jit/script/module.cpp @@ -354,7 +354,7 @@ void Module::train(bool on) { if (auto slot = find_attribute("training")) { slot->setValue(on); } else { - register_attribute("training", BoolType::get(), on); + TORCH_INTERNAL_ASSERT("'training' attribute not found"); } } diff --git a/torch/csrc/jit/script/module.h b/torch/csrc/jit/script/module.h index 17088b76ac514..dd36df7f9174b 100644 --- a/torch/csrc/jit/script/module.h +++ b/torch/csrc/jit/script/module.h @@ -259,6 +259,7 @@ struct TORCH_API Module { if (auto p = find_attribute("training")) { return p->value().toBool(); } + // We are in training mode by default return true; } diff --git a/torch/csrc/jit/script/python_sugared_value.cpp b/torch/csrc/jit/script/python_sugared_value.cpp index 90259466cac08..275b716bfe787 100644 --- a/torch/csrc/jit/script/python_sugared_value.cpp +++ b/torch/csrc/jit/script/python_sugared_value.cpp @@ -339,21 +339,6 @@ std::shared_ptr ModuleValue::attr( const SourceRange& loc, Function& m, const std::string& field) { - // workaround to make self.training work - // it adds a buffer 'training' to the model if one doesn't exist - // and then loads that parameter, casting it to bool - if (field == "training") { - c10::optional v = module_.find_attribute(field); - if (!v) { - bool training = py::cast(py::getattr(py_module_, "training")); - module_.register_attribute( - "training", BoolType::get(), std::move(training)); - v = module_.find_attribute(field); - } - Value* the_bool = m.graph()->insertGetAttr(self_, "training"); - return std::make_shared(the_bool); - } - if (auto v = module_.find_module(field)) { return std::make_shared( m.graph()->insertGetAttr(self_, field), @@ -503,19 +488,25 @@ std::shared_ptr BooleanDispatchValue::call( auto index = py::cast(dispatched_fn_["index"]); auto arg_name = py::str(dispatched_fn_["arg_name"]); + ErrorReport error(loc); if (index < inputs.size()) { // Dispatch flag is in arg list result = constant_as(inputs.at(index).value(graph)); + error << "Argument for boolean dispatch at position " << index + << " was not constant"; } else if (auto i = findInputWithName(arg_name, attributes)) { // Dispatch flag is in kwargs result = constant_as(attributes[*i].value(graph)); + error << "Keyword argument '" << arg_name + << "' for boolean dispatch at position was not constant"; } else { // Didn't find dispatch flag, so use default value result = py::cast(dispatched_fn_["default"]); + TORCH_INTERNAL_ASSERT(result); } if (!result) { - throw ErrorReport(loc) << "value for boolean dispatch was not constant"; + throw error; } std::shared_ptr value; diff --git a/torch/jit/__init__.py b/torch/jit/__init__.py index 62c3835c1251f..72ee285e7bee8 100644 --- a/torch/jit/__init__.py +++ b/torch/jit/__init__.py @@ -1515,8 +1515,15 @@ def __init__(self, optimize=None, _qualified_name=None, _compilation_unit=None, else: 
self.__dict__['_c'] = torch._C.ScriptModule(_qualified_name, _compilation_unit, True) - Module._Module__construct(self) - Module.__setattr__(self, "training", True) + training = None + if self._c._has_attribute('training'): + training = self._c._get_attribute('training') + super(ScriptModule, self).__init__() + if training is not None: + self.training = training + self._c._register_attribute('training', torch._C.BoolType.get(), training) + elif not self._c._has_attribute('training'): + self._c._register_attribute('training', torch._C.BoolType.get(), self.training) self._parameters = OrderedParameterDict(self._c) self._buffers = OrderedBufferDict(self._c) @@ -1582,11 +1589,6 @@ def __getattr__(self, attr): def __setattr__(self, attr, value): if attr not in self._constants_set: - if attr == 'training': - if self._c._has_attribute('training'): - self.__dict__['training'] = value - self._c._set_attribute('training', value) - return if isinstance(value, Attribute): the_type = torch.jit.annotations.ann_to_type(value.type) try: @@ -1595,6 +1597,8 @@ def __setattr__(self, attr, value): raise RuntimeError("Could not register attribute '{}' of type '{}' for a value of type '{}'" .format(attr, value.type, type(value.value))) return + if self._c._has_attribute(attr): + self._c._set_attribute(attr, value) return super(ScriptModule, self).__setattr__(attr, value) if hasattr(self, attr): diff --git a/torch/jit/_recursive.py b/torch/jit/_recursive.py index c9706a2242046..68d3b3a9f5bd1 100644 --- a/torch/jit/_recursive.py +++ b/torch/jit/_recursive.py @@ -58,10 +58,6 @@ def copy_to_script_module(original, stubs): # the type if possible class_annotations = getattr(original, '__annotations__', {}) for name in dir(original): - if name in ("training", "__dict__"): - # TODO: removing this skip should let us remove the code to add training as an - # attribute in python_sugared_value.cpp - continue if hasattr(script_module, name): # Don't re-copy properties continue diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py index 282d2dc30819d..7762768f3819b 100644 --- a/torch/nn/modules/module.py +++ b/torch/nn/modules/module.py @@ -68,16 +68,12 @@ def forward(self, x): _version = 1 def __init__(self): - self.__construct() - # initialize self.training separately from the rest of the internal - # state, as it is managed differently by nn.Module and ScriptModule - self.training = True - - def __construct(self): """ Initializes internal Module state, shared by both nn.Module and ScriptModule. 
""" torch._C._log_api_usage_once("python.nn_module") + + self.training = True self._parameters = OrderedDict() self._buffers = OrderedDict() self._backward_hooks = OrderedDict() diff --git a/torch/nn/quantized/modules/conv.py b/torch/nn/quantized/modules/conv.py index fa0f166c005e4..2ae5248c26a72 100644 --- a/torch/nn/quantized/modules/conv.py +++ b/torch/nn/quantized/modules/conv.py @@ -166,7 +166,8 @@ def __getstate__(self): w, b, self.scale, - self.zero_point + self.zero_point, + self.training ) # ===== Deserialization methods ===== @@ -190,7 +191,7 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, @torch.jit.export def __setstate__(self, state): - # type: (Tuple[int, int, Tuple[int, int], Tuple[int, int], Tuple[int, int], Tuple[int, int], bool, int, int, str, Tensor, Optional[Tensor], float, int]) # noqa + # type: (Tuple[int, int, Tuple[int, int], Tuple[int, int], Tuple[int, int], Tuple[int, int], bool, int, int, str, Tensor, Optional[Tensor], float, int, bool]) # noqa self.in_channels = state[0] self.out_channels = state[1] self.kernel_size = state[2] @@ -204,6 +205,7 @@ def __setstate__(self, state): self.set_weight_bias(state[10], state[11]) self.scale = state[12] self.zero_point = state[13] + self.training = state[14] @classmethod def from_float(cls, mod): diff --git a/torch/nn/quantized/modules/linear.py b/torch/nn/quantized/modules/linear.py index 9220fcb5a415c..449d19ef9a936 100644 --- a/torch/nn/quantized/modules/linear.py +++ b/torch/nn/quantized/modules/linear.py @@ -158,7 +158,8 @@ def __getstate__(self): b, w, self.scale, - self.zero_point + self.zero_point, + self.training ) # ===== Deserialization methods ===== @@ -181,12 +182,13 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, @torch.jit.export def __setstate__(self, state): - # type: (Tuple[int, int, Optional[torch.Tensor], torch.Tensor, float, int]) -> None + # type: (Tuple[int, int, Optional[torch.Tensor], torch.Tensor, float, int, bool]) -> None self.in_features = state[0] self.out_features = state[1] self.set_weight_bias(state[3], state[2]) self.scale = state[4] self.zero_point = state[5] + self.training = state[6] # Function rather than property to make sure that JIT serialization doesn't # register this as an attribute diff --git a/torch/quantization/_quantize_script.py b/torch/quantization/_quantize_script.py index aa83dd813d0b2..722f04723ae24 100644 --- a/torch/quantization/_quantize_script.py +++ b/torch/quantization/_quantize_script.py @@ -24,11 +24,13 @@ def forward(self, x): @torch.jit.export def __getstate__(self): - return self._weight_bias() + return self._weight_bias(), self.training @torch.jit.export def __setstate__(self, state): - self.set_weight_bias(state[0], state[1]) + # type: (Tuple[Tuple[Tensor, Optional[Tensor]], bool]) -> None + self.set_weight_bias(state[0][0], state[0][1]) + self.training = state[1] def _check_is_script_module(model): if not isinstance(model, torch.jit.ScriptModule): diff --git a/torch/utils/mkldnn.py b/torch/utils/mkldnn.py index 73524f8a52998..958f409505af1 100644 --- a/torch/utils/mkldnn.py +++ b/torch/utils/mkldnn.py @@ -17,13 +17,14 @@ def __init__(self, dense_module): @torch.jit.script_method def __getstate__(self): - return (self.weight.to_dense(), self.bias.to_dense()) + return (self.weight.to_dense(), self.bias.to_dense(), self.training) @torch.jit.script_method def __setstate__(self, state): - # type: (Tuple[Tensor, Tensor]) -> None + # type: (Tuple[Tensor, Tensor, bool]) -> None self.weight = 
state[0].to_mkldnn() self.bias = state[1].to_mkldnn() + self.training = state[2] @torch.jit.script_method def forward(self, x): @@ -55,11 +56,11 @@ def __init__(self, dense_module): @torch.jit.script_method def __getstate__(self): - return (self.weight.to_dense(), self.bias.to_dense()) + return (self.weight.to_dense(), self.bias.to_dense(), self.training) @torch.jit.script_method def __setstate__(self, state): - # type: (Tuple[Tensor, Tensor]) -> None + # type: (Tuple[Tensor, Tensor, bool]) -> None self.weight = torch._C._nn.mkldnn_reorder_conv2d_weight( state[0].to_mkldnn(), self.padding, @@ -67,6 +68,7 @@ def __setstate__(self, state): self.dilation, self.groups) self.bias = state[1].to_mkldnn() + self.training = state[2] @torch.jit.script_method def forward(self, x): @@ -107,15 +109,16 @@ def __getstate__(self): bias = self.bias.to_dense() running_mean = self.running_mean.to_dense() running_var = self.running_var.to_dense() - return (weight, bias, running_mean, running_var) + return (weight, bias, running_mean, running_var, self.training) @torch.jit.script_method def __setstate__(self, state): - # type: (Tuple[Tensor, Tensor, Tensor, Tensor]) -> None + # type: (Tuple[Tensor, Tensor, Tensor, Tensor, bool]) -> None self.weight = state[0].to_mkldnn() self.bias = state[1].to_mkldnn() self.running_mean = state[2].to_mkldnn() self.running_var = state[3].to_mkldnn() + self.training = state[4] @torch.jit.script_method def forward(self, x): From a4cba50d6253bd51ba410e723d6036112e79a224 Mon Sep 17 00:00:00 2001 From: Natalia Lunova Date: Mon, 7 Oct 2019 14:09:00 -0700 Subject: [PATCH 094/113] Put metrics back to torch.utils.tensorboard similar we have in TensorboardX Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27252 Test Plan: Check metrics in the Scuba table: https://fburl.com/scuba/k5x8yosj Reviewed By: sanekmelnikov Differential Revision: D17723414 fbshipit-source-id: 64d42e0b4582f635d38f38feb2b2a6c4826f2065 --- torch/utils/tensorboard/writer.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py index 01e27b5813915..f08b367aae961 100644 --- a/torch/utils/tensorboard/writer.py +++ b/torch/utils/tensorboard/writer.py @@ -287,6 +287,7 @@ def add_hparams(self, hparam_dict=None, metric_dict=None): .. 
image:: _static/img/tensorboard/add_hparam.png :scale: 50 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_hparams") if type(hparam_dict) is not dict or type(metric_dict) is not dict: raise TypeError('hparam_dict and metric_dict should be dictionary.') exp, ssi, sei = hparams(hparam_dict, metric_dict) @@ -323,6 +324,7 @@ def add_scalar(self, tag, scalar_value, global_step=None, walltime=None): :scale: 50 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_scalar") if self._check_caffe2_blob(scalar_value): scalar_value = workspace.FetchBlob(scalar_value) self._get_file_writer().add_summary( @@ -359,6 +361,7 @@ def add_scalars(self, main_tag, tag_scalar_dict, global_step=None, walltime=None :scale: 50 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_scalars") walltime = time.time() if walltime is None else walltime fw_logdir = self._get_file_writer().get_logdir() for tag, scalar_value in tag_scalar_dict.items(): @@ -402,6 +405,7 @@ def add_histogram(self, tag, values, global_step=None, bins='tensorflow', wallti :scale: 50 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_histogram") if self._check_caffe2_blob(values): values = workspace.FetchBlob(values) if isinstance(bins, six.string_types) and bins == 'tensorflow': @@ -461,6 +465,7 @@ def add_histogram_raw(self, tag, min, max, num, sum, sum_squares, :scale: 50 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_histogram_raw") if len(bucket_limits) != len(bucket_counts): raise ValueError('len(bucket_limits) != len(bucket_counts), see the document.') self._get_file_writer().add_summary( @@ -517,6 +522,7 @@ def add_image(self, tag, img_tensor, global_step=None, walltime=None, dataformat :scale: 50 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_image") if self._check_caffe2_blob(img_tensor): img_tensor = workspace.FetchBlob(img_tensor) self._get_file_writer().add_summary( @@ -559,6 +565,7 @@ def add_images(self, tag, img_tensor, global_step=None, walltime=None, dataforma :scale: 30 % """ + torch._C._log_api_usage_once("tensorboard.logging.add_images") if self._check_caffe2_blob(img_tensor): img_tensor = workspace.FetchBlob(img_tensor) self._get_file_writer().add_summary( @@ -585,6 +592,7 @@ def add_image_with_boxes(self, tag, img_tensor, box_tensor, global_step=None, box_tensor: (torch.Tensor, numpy.array, or string/blobname): NX4, where N is the number of boxes and each 4 elememts in a row represents (xmin, ymin, xmax, ymax). """ + torch._C._log_api_usage_once("tensorboard.logging.add_image_with_boxes") if self._check_caffe2_blob(img_tensor): img_tensor = workspace.FetchBlob(img_tensor) if self._check_caffe2_blob(box_tensor): @@ -605,6 +613,7 @@ def add_figure(self, tag, figure, global_step=None, close=True, walltime=None): walltime (float): Optional override default walltime (time.time()) seconds after epoch of event """ + torch._C._log_api_usage_once("tensorboard.logging.add_figure") if isinstance(figure, list): self.add_image(tag, figure_to_image(figure, close), global_step, walltime, dataformats='NCHW') else: @@ -625,6 +634,7 @@ def add_video(self, tag, vid_tensor, global_step=None, fps=4, walltime=None): Shape: vid_tensor: :math:`(N, T, C, H, W)`. The values should lie in [0, 255] for type `uint8` or [0, 1] for type `float`. 
""" + torch._C._log_api_usage_once("tensorboard.logging.add_video") self._get_file_writer().add_summary( video(tag, vid_tensor, fps), global_step, walltime) @@ -641,6 +651,7 @@ def add_audio(self, tag, snd_tensor, global_step=None, sample_rate=44100, wallti Shape: snd_tensor: :math:`(1, L)`. The values should lie between [-1, 1]. """ + torch._C._log_api_usage_once("tensorboard.logging.add_audio") if self._check_caffe2_blob(snd_tensor): snd_tensor = workspace.FetchBlob(snd_tensor) self._get_file_writer().add_summary( @@ -660,15 +671,18 @@ def add_text(self, tag, text_string, global_step=None, walltime=None): writer.add_text('lstm', 'This is an lstm', 0) writer.add_text('rnn', 'This is an rnn', 10) """ + torch._C._log_api_usage_once("tensorboard.logging.add_text") self._get_file_writer().add_summary( text(tag, text_string), global_step, walltime) def add_onnx_graph(self, prototxt): + torch._C._log_api_usage_once("tensorboard.logging.add_onnx_graph") self._get_file_writer().add_onnx_graph(load_onnx_graph(prototxt)) def add_graph(self, model, input_to_model=None, verbose=False): # prohibit second call? # no, let tensorboard handle it and show its warning message. + torch._C._log_api_usage_once("tensorboard.logging.add_graph") """Add graph data to summary. Args: @@ -742,6 +756,7 @@ def add_embedding(self, mat, metadata=None, label_img=None, global_step=None, ta writer.add_embedding(torch.randn(100, 5), label_img=label_img) writer.add_embedding(torch.randn(100, 5), metadata=meta) """ + torch._C._log_api_usage_once("tensorboard.logging.add_embedding") mat = make_np(mat) if global_step is None: global_step = 0 @@ -800,6 +815,7 @@ def add_pr_curve(self, tag, labels, predictions, global_step=None, writer.close() """ + torch._C._log_api_usage_once("tensorboard.logging.add_pr_curve") labels, predictions = make_np(labels), make_np(predictions) self._get_file_writer().add_summary( pr_curve(tag, labels, predictions, num_thresholds, weights), @@ -831,6 +847,7 @@ def add_pr_curve_raw(self, tag, true_positive_counts, seconds after epoch of event see: https://github.com/tensorflow/tensorboard/blob/master/tensorboard/plugins/pr_curve/README.md """ + torch._C._log_api_usage_once("tensorboard.logging.add_pr_curve_raw") self._get_file_writer().add_summary( pr_curve_raw(tag, true_positive_counts, @@ -855,6 +872,7 @@ def add_custom_scalars_multilinechart(self, tags, category='default', title='unt writer.add_custom_scalars_multilinechart(['twse/0050', 'twse/2330']) """ + torch._C._log_api_usage_once("tensorboard.logging.add_custom_scalars_multilinechart") layout = {category: {title: ['Multiline', tags]}} self._get_file_writer().add_summary(custom_scalars(layout)) @@ -869,6 +887,7 @@ def add_custom_scalars_marginchart(self, tags, category='default', title='untitl writer.add_custom_scalars_marginchart(['twse/0050', 'twse/2330', 'twse/2006']) """ + torch._C._log_api_usage_once("tensorboard.logging.add_custom_scalars_marginchart") assert len(tags) == 3 layout = {category: {title: ['Margin', tags]}} self._get_file_writer().add_summary(custom_scalars(layout)) @@ -892,6 +911,7 @@ def add_custom_scalars(self, layout): writer.add_custom_scalars(layout) """ + torch._C._log_api_usage_once("tensorboard.logging.add_custom_scalars") self._get_file_writer().add_summary(custom_scalars(layout)) def add_mesh(self, tag, vertices, colors=None, faces=None, config_dict=None, global_step=None, walltime=None): @@ -945,6 +965,7 @@ def add_mesh(self, tag, vertices, colors=None, faces=None, config_dict=None, glo writer.close() """ + 
torch._C._log_api_usage_once("tensorboard.logging.add_mesh") self._get_file_writer().add_summary(mesh(tag, vertices, colors, faces, config_dict), global_step, walltime) def flush(self): From 8cc9d276476a2793cdc8e073ce4346bd16258cf9 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Mon, 7 Oct 2019 14:14:55 -0700 Subject: [PATCH 095/113] Automatic update of fbcode/onnx to 2891e1459745933f4bba9a8cb3371cf3c9eb1d16 (#27474) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27474 Previous import was 034921bd574cc84906b7996c07873454b7dd4135 Included changes: - **[2891e145](https://github.com/onnx/onnx/commit/2891e145)**: Fix Unique unit test (#2381) - **[25cf73e5](https://github.com/onnx/onnx/commit/25cf73e5)**: update shapeInference h file link (#2369) - **[e3074bc0](https://github.com/onnx/onnx/commit/e3074bc0)**: modify file path (#2378) - **[9058d3a4](https://github.com/onnx/onnx/commit/9058d3a4)**: Incrementing version number to 1.6.0 (#2353) (#2385) - **[c963586d](https://github.com/onnx/onnx/commit/c963586d)**: Remove typing packages from test requirements (#2375) Test Plan: ci Reviewed By: bddppq Differential Revision: D17791527 fbshipit-source-id: 23ad5abe313cd4e4eedcbe7794b98450b3b7d3bc --- third_party/onnx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/onnx b/third_party/onnx index 034921bd574cc..2891e14597459 160000 --- a/third_party/onnx +++ b/third_party/onnx @@ -1 1 @@ -Subproject commit 034921bd574cc84906b7996c07873454b7dd4135 +Subproject commit 2891e1459745933f4bba9a8cb3371cf3c9eb1d16 From 16454095e09eab2e737d99ad569cd44bb7910f03 Mon Sep 17 00:00:00 2001 From: Negin Raoof Date: Mon, 7 Oct 2019 14:22:42 -0700 Subject: [PATCH 096/113] Fixed Select symbolic to export slice when index = negative one (#25273) Summary: Exporting torch.select when the index is negative one (x[:,-1]) was broken. This PR fixes the symbolic function for select.
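For reference, a minimal repro sketch (the module and output file names below are illustrative, not part of this PR): with this change, a negative index is exported as a Slice over [index, index + 1) along the selected axis, with index == -1 special-cased (since -1 + 1 == 0 would produce an empty slice) to an open end of INT64_MAX (9223372036854775807), followed by a Squeeze; non-negative indices still export as Gather.

    import torch

    class NegSelect(torch.nn.Module):
        def forward(self, x):
            # Select with a negative index along dim 1; this pattern
            # previously exported incorrectly (see the issue-10984 skips
            # removed from the tests below).
            return x[:, -1]

    # Now lowered to Slice(axes=[1], starts=[-1], ends=[INT64_MAX])
    # followed by Squeeze(axes=[1]) instead of a Gather with index -1.
    torch.onnx.export(NegSelect(), torch.randn(3, 4, 5), "neg_select.onnx")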
Pull Request resolved: https://github.com/pytorch/pytorch/pull/25273 Reviewed By: hl475 Differential Revision: D17159707 Pulled By: houseroad fbshipit-source-id: 2c3b275421082758f1b63c1c9b6e578f03ca9f76 --- caffe2/onnx/backend.cc | 18 ++++++-------- .../TestOperators.test_dyn_arange.expect | 12 +++++----- .../expect/TestOperators.test_full.expect | 24 +++++++++---------- ...TestOperators.test_upsample_nearest.expect | 24 +++++++++---------- .../TestOperators.test_view_flatten.expect | 24 +++++++++---------- test/onnx/test_onnx_opset.py | 16 +++++++------ test/onnx/test_pytorch_onnx_caffe2.py | 1 - test/onnx/test_pytorch_onnx_onnxruntime.py | 3 +-- torch/onnx/symbolic_opset9.py | 14 +++++------ 9 files changed, 66 insertions(+), 70 deletions(-) diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index e0943294580c1..284be62b96273 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -709,20 +709,16 @@ Caffe2Ops Caffe2Backend::CreateGather( std::vector inputs; inputs.emplace_back(node.input(0)); inputs.emplace_back(node.input(1)); + + auto axis = onnx_node->attributes.get("axis", 0L); + caffe2::Argument arg_axis; + arg_axis.set_name("axis"); + arg_axis.set_i(axis); + std::vector outputs; outputs.emplace_back(node.output(0)); - auto axis = onnx_node->attributes.get("axis", 0L); - if (axis == 0) { - BuildOperator(c2_op, "Gather", inputs, outputs); - } else if (axis == 1) { - BuildOperator(c2_op, "BatchGather", inputs, outputs); - } else { - CAFFE_THROW( - "Caffe2 only supports Gather with axis being 0 or 1, ", - "whereas axis is ", - axis); - } + BuildOperator(c2_op, "Gather", inputs, outputs, {arg_axis}); return ret; } diff --git a/test/onnx/expect/TestOperators.test_dyn_arange.expect b/test/onnx/expect/TestOperators.test_dyn_arange.expect index 552114be962bb..2dd557d6af4db 100644 --- a/test/onnx/expect/TestOperators.test_dyn_arange.expect +++ b/test/onnx/expect/TestOperators.test_dyn_arange.expect @@ -3,7 +3,12 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { + input: "0" output: "1" + op_type: "Shape" + } + node { + output: "2" op_type: "Constant" attribute { name: "value" @@ -15,13 +20,8 @@ graph { } } node { - input: "0" - output: "2" - op_type: "Shape" - } - node { - input: "2" input: "1" + input: "2" output: "3" op_type: "Gather" attribute { diff --git a/test/onnx/expect/TestOperators.test_full.expect b/test/onnx/expect/TestOperators.test_full.expect index d13667ef84757..c11d3e94c1f8a 100644 --- a/test/onnx/expect/TestOperators.test_full.expect +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -3,7 +3,12 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { + input: "0" output: "1" + op_type: "Shape" + } + node { + output: "2" op_type: "Constant" attribute { name: "value" @@ -15,13 +20,8 @@ graph { } } node { - input: "0" - output: "2" - op_type: "Shape" - } - node { - input: "2" input: "1" + input: "2" output: "3" op_type: "Gather" attribute { @@ -31,7 +31,12 @@ graph { } } node { + input: "0" output: "4" + op_type: "Shape" + } + node { + output: "5" op_type: "Constant" attribute { name: "value" @@ -43,13 +48,8 @@ graph { } } node { - input: "0" - output: "5" - op_type: "Shape" - } - node { - input: "5" input: "4" + input: "5" output: "6" op_type: "Gather" attribute { diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest.expect b/test/onnx/expect/TestOperators.test_upsample_nearest.expect index 16451536e1819..baf27592822b0 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest.expect +++ 
b/test/onnx/expect/TestOperators.test_upsample_nearest.expect @@ -3,7 +3,12 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { + input: "input" output: "1" + op_type: "Shape" + } + node { + output: "2" op_type: "Constant" attribute { name: "value" @@ -15,13 +20,8 @@ graph { } } node { - input: "input" - output: "2" - op_type: "Shape" - } - node { - input: "2" input: "1" + input: "2" output: "3" op_type: "Gather" attribute { @@ -74,7 +74,12 @@ graph { op_type: "Floor" } node { + input: "input" output: "9" + op_type: "Shape" + } + node { + output: "10" op_type: "Constant" attribute { name: "value" @@ -86,13 +91,8 @@ graph { } } node { - input: "input" - output: "10" - op_type: "Shape" - } - node { - input: "10" input: "9" + input: "10" output: "11" op_type: "Gather" attribute { diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect index 4d81920b95f75..9909c19c15265 100644 --- a/test/onnx/expect/TestOperators.test_view_flatten.expect +++ b/test/onnx/expect/TestOperators.test_view_flatten.expect @@ -3,7 +3,12 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { + input: "0" output: "1" + op_type: "Shape" + } + node { + output: "2" op_type: "Constant" attribute { name: "value" @@ -15,13 +20,8 @@ graph { } } node { - input: "0" - output: "2" - op_type: "Shape" - } - node { - input: "2" input: "1" + input: "2" output: "3" op_type: "Gather" attribute { @@ -31,7 +31,12 @@ graph { } } node { + input: "0" output: "4" + op_type: "Shape" + } + node { + output: "5" op_type: "Constant" attribute { name: "value" @@ -43,13 +48,8 @@ graph { } } node { - input: "0" - output: "5" - op_type: "Shape" - } - node { - input: "5" input: "4" + input: "5" output: "6" op_type: "Gather" attribute { diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index 0efe3dd875d63..cfcd510b5940f 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -186,7 +186,7 @@ def forward(self, x): "attributes" : []}] ops = {9 : ops_9, 10 : ops_10} x = torch.randn(3) - check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) + check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9]) class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method @@ -196,6 +196,7 @@ def forward(self, x): ops_9 = [{"op_name" : "Constant"}, {"op_name" : "Constant"}, {"op_name" : "Shape"}, + {"op_name": "Constant"}, {"op_name" : "Gather", "attributes" : [{"name" : "axis", "i" : 0, "type" : 2}]}, {"op_name" : "Unsqueeze", @@ -208,6 +209,7 @@ def forward(self, x): ops_10 = [{"op_name" : "Constant"}, {"op_name" : "Constant"}, {"op_name" : "Shape"}, + {"op_name": "Constant"}, {"op_name" : "Gather", "attributes" : [{"name" : "axis", "i" : 0, "type" : 2}]}, {"op_name" : "Unsqueeze", @@ -284,11 +286,11 @@ def forward(self, x): return torch.nn.functional.interpolate(x, size=size, mode='nearest') - ops_9 = [{"op_name" : "Constant"}, - {"op_name" : "Shape"}, - {"op_name" : "Gather"}, + ops_9 = [{"op_name" : "Shape"}, {"op_name" : "Constant"}, + {"op_name" : "Gather"}, {"op_name" : "Shape"}, + {"op_name" : "Constant"}, {"op_name" : "Gather"}, {"op_name" : "Constant"}, {"op_name" : "Mul"}, @@ -307,11 +309,11 @@ def forward(self, x): {"op_name" : "Upsample", "attributes" : [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Constant"}, - {"op_name" : "Shape"}, - {"op_name" : "Gather"}, + ops_10 = [{"op_name" : "Shape"}, {"op_name" : "Constant"}, + {"op_name" : 
"Gather"}, {"op_name" : "Shape"}, + {"op_name" : "Constant"}, {"op_name" : "Gather"}, {"op_name" : "Constant"}, {"op_name" : "Mul"}, diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index 32ff6d97c56f6..fac6715f167c5 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -1306,7 +1306,6 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, use_gpu=False) - @unittest.skip('https://github.com/pytorch/pytorch/issues/10984') @skipIfUnsupportedOpsetVersion([10]) def test_neg_slice_large_negone(self): class NegSlice(torch.nn.Module): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 3c741fb8845d9..11af41143c314 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -345,12 +345,11 @@ def forward(self, x): def test_slice_neg_large(self): class NegSlice(torch.nn.Module): def forward(self, x): - return x[:, :, :, :, -3] + return x[:, :, -3:-1, :, -1] x = torch.randn(3, 4, 5, 6, 7) self.run_test(NegSlice(), x) - @unittest.skip('https://github.com/pytorch/pytorch/issues/10984') def test_slice_neg_large_negone(self): class NegSlice(torch.nn.Module): def forward(self, x): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index 19dd546ecc832..a174788005ec8 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -394,13 +394,13 @@ def split_with_sizes(g, self, split_sizes, dim): @parse_args('v', 'i', 'v') def select(g, self, dim, index): - if dim > 1: - # TODO: this is a temporary hack because of the implementation details - # of Gather in caffe2. We need to change this as soon as possible. 
- TODO: this breaks if index == -1 - index_val = _parse_arg(index, 'i') - slice_node = sym_help._slice_helper(g, self, axes=[dim], - starts=[index_val], ends=[index_val + 1]) + index = sym_help._maybe_get_scalar(index) + if (not sym_help._is_value(index)) and (index < 0): + if index == -1: + end_index = 9223372036854775807 + else: + end_index = index + 1 + slice_node = sym_help._slice_helper(g, self, axes=[dim], starts=[index], ends=[end_index]) return g.op("Squeeze", slice_node, axes_i=[dim]) else: return g.op("Gather", self, index, axis_i=dim) From aab9673e8d2ed1555dd2295d0ae08c608eb106db Mon Sep 17 00:00:00 2001 From: Nikita Shulga Date: Mon, 7 Oct 2019 16:32:24 -0700 Subject: [PATCH 097/113] Avoid variable shadowing in ``::at::philox_engine::single_round()`` (#27486) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27486 Rename the `key` argument of the `single_round` method to `in_key` Test Plan: CI Reviewed By: stepancheg, soumith Differential Revision: D17782904 fbshipit-source-id: 6feae55c407f39d41db099b013dcbd3990768603 --- aten/src/ATen/core/PhiloxRNGEngine.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aten/src/ATen/core/PhiloxRNGEngine.h b/aten/src/ATen/core/PhiloxRNGEngine.h index 9a283f1d06d66..1e597e8fc1f64 100644 --- a/aten/src/ATen/core/PhiloxRNGEngine.h +++ b/aten/src/ATen/core/PhiloxRNGEngine.h @@ -180,15 +180,15 @@ class philox_engine { #endif } - C10_HOST_DEVICE inline detail::UINT4 single_round(detail::UINT4 ctr, detail::UINT2 key) { + C10_HOST_DEVICE inline detail::UINT4 single_round(detail::UINT4 ctr, detail::UINT2 in_key) { uint32_t hi0; uint32_t hi1; uint32_t lo0 = mulhilo32(kPhiloxSA, ctr[0], &hi0); uint32_t lo1 = mulhilo32(kPhiloxSB, ctr[2], &hi1); detail::UINT4 ret; - ret[0] = hi1 ^ ctr[1] ^ key[0]; + ret[0] = hi1 ^ ctr[1] ^ in_key[0]; ret[1] = lo1; - ret[2] = hi0 ^ ctr[3] ^ key[1]; + ret[2] = hi0 ^ ctr[3] ^ in_key[1]; ret[3] = lo0; return ret; } From b66df47a11c21f31559101387b4b50302192e4fa Mon Sep 17 00:00:00 2001 From: David Reiss Date: Mon, 7 Oct 2019 16:58:27 -0700 Subject: [PATCH 098/113] Refactor python_android test to separate Android-specific components (#27453) Summary: All of the test cases move into a base class that is extended by the instrumentation test and a new "HostTests" class that can be run in normal Java. (Some changes to the build script and dependencies are required before the host test can actually run.)
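The resulting layout is a template-method split; here is a minimal sketch under simplified names (the real classes follow in the diff), where the only per-environment hook is how a model asset is resolved to a file path:

    import java.io.IOException;
    import org.junit.Test;
    import static org.junit.Assert.assertNotNull;

    // Sketch only: shared assertions live in the abstract base, while each
    // environment (Android instrumentation vs. plain host JVM) overrides
    // assetFilePath() to decide where the "test.pt" asset comes from.
    abstract class TestBaseSketch {
      protected abstract String assetFilePath(String assetName) throws IOException;

      @Test
      public void testAssetResolves() throws IOException {
        // Runs unchanged in both environments; only asset resolution differs.
        assertNotNull(assetFilePath("test.pt"));
      }
    }

JUnit runs inherited tests through each concrete subclass, so a single test body exercises both environments.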
ghstack-source-id: fe1165b513241b92c5f4a81447f5e184b3bfc75e Pull Request resolved: https://github.com/pytorch/pytorch/pull/27453 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D17800410 fbshipit-source-id: 1184f0caebdfa219f4ccd1464c67826ac0220181 --- .../java/org/pytorch/PytorchHostTests.java | 30 ++ .../org/pytorch/PytorchInstrumentedTests.java | 287 +---------------- .../java/org/pytorch/PytorchTestBase.java | 290 ++++++++++++++++++ 3 files changed, 323 insertions(+), 284 deletions(-) create mode 100644 android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java create mode 100644 android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java new file mode 100644 index 0000000000000..47367180b6bc0 --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchHostTests.java @@ -0,0 +1,30 @@ +package org.pytorch; + +import com.facebook.soloader.nativeloader.NativeLoader; +import com.facebook.soloader.nativeloader.SystemDelegate; +import org.junit.BeforeClass; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; +import java.util.Objects; + +public class PytorchHostTests extends PytorchTestBase { + @BeforeClass + public static void setUpClass() { + if (!NativeLoader.isInitialized()) { + NativeLoader.init(new SystemDelegate()); + } + } + + @Override + protected String assetFilePath(String assetName) throws IOException { + Path tempFile = Files.createTempFile("test", ".pt"); + try (InputStream resource = Objects.requireNonNull(getClass().getClassLoader().getResourceAsStream("test.pt"))) { + Files.copy(resource, tempFile, StandardCopyOption.REPLACE_EXISTING); + } + return tempFile.toAbsolutePath().toString(); + } +} diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java index 7eed05c4b1fb6..43a6c4f366777 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchInstrumentedTests.java @@ -2,8 +2,6 @@ import android.content.Context; -import org.junit.Before; -import org.junit.Test; import org.junit.runner.RunWith; import java.io.File; @@ -11,294 +9,15 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.HashMap; -import java.util.Map; import androidx.test.ext.junit.runners.AndroidJUnit4; import androidx.test.platform.app.InstrumentationRegistry; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; - @RunWith(AndroidJUnit4.class) -public class PytorchInstrumentedTests { - - private static final String TEST_MODULE_ASSET_NAME = "test.pt"; - - @Before - public void setUp() { - System.loadLibrary("pytorch"); - } - - @Test - public void testForwardNull() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - final IValue input = - IValue.tensor(Tensor.newInt8Tensor(new long[] {1}, Tensor.allocateByteBuffer(1))); - assertTrue(input.isTensor()); - final IValue output = 
module.forward(input); - assertTrue(output.isNull()); - } - - @Test - public void testEqBool() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - for (boolean value : new boolean[] {false, true}) { - final IValue input = IValue.bool(value); - assertTrue(input.isBool()); - assertTrue(value == input.getBool()); - final IValue output = module.runMethod("eqBool", input); - assertTrue(output.isBool()); - assertTrue(value == output.getBool()); - } - } - - @Test - public void testEqInt() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) { - final IValue input = IValue.long64(value); - assertTrue(input.isLong()); - assertTrue(value == input.getLong()); - final IValue output = module.runMethod("eqInt", input); - assertTrue(output.isLong()); - assertTrue(value == output.getLong()); - } - } - - @Test - public void testEqFloat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - double[] values = - new double[] { - -Double.MAX_VALUE, - Double.MAX_VALUE, - -Double.MIN_VALUE, - Double.MIN_VALUE, - -Math.exp(1.d), - -Math.sqrt(2.d), - -3.1415f, - 3.1415f, - -1, - 0, - 1, - }; - for (double value : values) { - final IValue input = IValue.double64(value); - assertTrue(input.isDouble()); - assertTrue(value == input.getDouble()); - final IValue output = module.runMethod("eqFloat", input); - assertTrue(output.isDouble()); - assertTrue(value == output.getDouble()); - } - } - - @Test - public void testEqTensor() throws IOException { - final long[] inputTensorShape = new long[] {1, 3, 224, 224}; - final long numElements = Tensor.numel(inputTensorShape); - final float[] inputTensorData = new float[(int) numElements]; - for (int i = 0; i < numElements; ++i) { - inputTensorData[i] = i; - } - final Tensor inputTensor = Tensor.newFloat32Tensor(inputTensorShape, inputTensorData); - - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - final IValue input = IValue.tensor(inputTensor); - assertTrue(input.isTensor()); - assertTrue(inputTensor == input.getTensor()); - final IValue output = module.runMethod("eqTensor", input); - assertTrue(output.isTensor()); - final Tensor outputTensor = output.getTensor(); - assertNotNull(outputTensor); - assertArrayEquals(inputTensorShape, outputTensor.shape); - float[] outputData = outputTensor.getDataAsFloatArray(); - for (int i = 0; i < numElements; i++) { - assertTrue(inputTensorData[i] == outputData[i]); - } - } - - @Test - public void testEqDictIntKeyIntValue() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - final Map inputMap = new HashMap<>(); - - inputMap.put(Long.MIN_VALUE, IValue.long64(-Long.MIN_VALUE)); - inputMap.put(Long.MAX_VALUE, IValue.long64(-Long.MAX_VALUE)); - inputMap.put(0l, IValue.long64(0l)); - inputMap.put(1l, IValue.long64(-1l)); - inputMap.put(-1l, IValue.long64(1l)); - - final IValue input = IValue.dictLongKey(inputMap); - assertTrue(input.isDictLongKey()); - - final IValue output = module.runMethod("eqDictIntKeyIntValue", input); - assertTrue(output.isDictLongKey()); - - final Map outputMap = output.getDictLongKey(); - assertTrue(inputMap.size() == outputMap.size()); - for (Map.Entry entry : inputMap.entrySet()) { - assertTrue(outputMap.get(entry.getKey()).getLong() == entry.getValue().getLong()); - } - } - - @Test - public void testEqDictStrKeyIntValue() 
throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - final Map inputMap = new HashMap<>(); - - inputMap.put("long_min_value", IValue.long64(Long.MIN_VALUE)); - inputMap.put("long_max_value", IValue.long64(Long.MAX_VALUE)); - inputMap.put("long_0", IValue.long64(0l)); - inputMap.put("long_1", IValue.long64(1l)); - inputMap.put("long_-1", IValue.long64(-1l)); - - final IValue input = IValue.dictStringKey(inputMap); - assertTrue(input.isDictStringKey()); - - final IValue output = module.runMethod("eqDictStrKeyIntValue", input); - assertTrue(output.isDictStringKey()); - - final Map outputMap = output.getDictStringKey(); - assertTrue(inputMap.size() == outputMap.size()); - for (Map.Entry entry : inputMap.entrySet()) { - assertTrue(outputMap.get(entry.getKey()).getLong() == entry.getValue().getLong()); - } - } - - @Test - public void testListIntSumReturnTuple() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - - for (int n : new int[] {0, 1, 128}) { - long[] a = new long[n]; - long sum = 0; - for (int i = 0; i < n; i++) { - a[i] = i; - sum += a[i]; - } - final IValue input = IValue.longList(a); - assertTrue(input.isLongList()); - - final IValue output = module.runMethod("listIntSumReturnTuple", input); - - assertTrue(output.isTuple()); - assertTrue(2 == output.getTuple().length); - - IValue output0 = output.getTuple()[0]; - IValue output1 = output.getTuple()[1]; - - assertArrayEquals(a, output0.getLongList()); - assertTrue(sum == output1.getLong()); - } - } - - @Test - public void testOptionalIntIsNone() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - - assertFalse(module.runMethod("optionalIntIsNone", IValue.long64(1l)).getBool()); - assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).getBool()); - } - - @Test - public void testIntEq0None() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - - assertTrue(module.runMethod("intEq0None", IValue.long64(0l)).isNull()); - assertTrue(module.runMethod("intEq0None", IValue.long64(1l)).getLong() == 1l); - } - - @Test(expected = IllegalArgumentException.class) - public void testRunUndefinedMethod() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - module.runMethod("test_undefined_method_throws_exception"); - } - - @Test - public void testTensorMethods() { - long[] shape = new long[] {1, 3, 224, 224}; - final int numel = (int) Tensor.numel(shape); - int[] ints = new int[numel]; - float[] floats = new float[numel]; - - byte[] bytes = new byte[numel]; - for (int i = 0; i < numel; i++) { - bytes[i] = (byte) ((i % 255) - 128); - ints[i] = i; - floats[i] = i / 1000.f; - } - - Tensor tensorBytes = Tensor.newInt8Tensor(shape, bytes); - assertTrue(tensorBytes.dtype() == Tensor.DTYPE_INT8); - assertArrayEquals(bytes, tensorBytes.getDataAsByteArray()); - - Tensor tensorInts = Tensor.newInt32Tensor(shape, ints); - assertTrue(tensorInts.dtype() == Tensor.DTYPE_INT32); - assertArrayEquals(ints, tensorInts.getDataAsIntArray()); - - Tensor tensorFloats = Tensor.newFloat32Tensor(shape, floats); - assertTrue(tensorFloats.dtype() == Tensor.DTYPE_FLOAT32); - float[] floatsOut = tensorFloats.getDataAsFloatArray(); - assertTrue(floatsOut.length == numel); - for (int i = 0; i < numel; i++) { - assertTrue(floats[i] == floatsOut[i]); - } - } - - @Test(expected = IllegalStateException.class) - public void 
testTensorIllegalStateOnWrongType() { - long[] shape = new long[] {1, 3, 224, 224}; - final int numel = (int) Tensor.numel(shape); - float[] floats = new float[numel]; - Tensor tensorFloats = Tensor.newFloat32Tensor(shape, floats); - assertTrue(tensorFloats.dtype() == Tensor.DTYPE_FLOAT32); - tensorFloats.getDataAsByteArray(); - } - - - @Test - public void testEqString() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - String[] values = - new String[] { - "smoketest", - "проверка не латинских символов", // not latin symbols check - "#@$!@#)($*!@#$)(!@*#$" - }; - for (String value : values) { - final IValue input = IValue.string(value); - assertTrue(input.isString()); - assertTrue(value.equals(input.getString())); - final IValue output = module.runMethod("eqStr", input); - assertTrue(output.isString()); - assertTrue(value.equals(output.getString())); - } - } - - @Test - public void testStr3Concat() throws IOException { - final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - String[] values = - new String[] { - "smoketest", - "проверка не латинских символов", // not latin symbols check - "#@$!@#)($*!@#$)(!@*#$" - }; - for (String value : values) { - final IValue input = IValue.string(value); - assertTrue(input.isString()); - assertTrue(value.equals(input.getString())); - final IValue output = module.runMethod("str3Concat", input); - assertTrue(output.isString()); - String expectedOutput = new StringBuilder().append(value).append(value).append(value).toString(); - assertTrue(expectedOutput.equals(output.getString())); - } - } +public class PytorchInstrumentedTests extends PytorchTestBase { - private static String assetFilePath(String assetName) throws IOException { + @Override + protected String assetFilePath(String assetName) throws IOException { final Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); File file = new File(appContext.getFilesDir(), assetName); if (file.exists() && file.length() > 0) { diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java new file mode 100644 index 0000000000000..544171a1aacd4 --- /dev/null +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -0,0 +1,290 @@ +package org.pytorch; + +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +public abstract class PytorchTestBase { + private static final String TEST_MODULE_ASSET_NAME = "test.pt"; + + @Before + public void setUp() { + System.loadLibrary("pytorch"); + } + + @Test + public void testForwardNull() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final IValue input = + IValue.tensor(Tensor.newInt8Tensor(new long[] {1}, Tensor.allocateByteBuffer(1))); + assertTrue(input.isTensor()); + final IValue output = module.forward(input); + assertTrue(output.isNull()); + } + + @Test + public void testEqBool() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + for (boolean value : new boolean[] {false, true}) { + final IValue input = IValue.bool(value); + assertTrue(input.isBool()); + assertTrue(value == 
input.getBool()); + final IValue output = module.runMethod("eqBool", input); + assertTrue(output.isBool()); + assertTrue(value == output.getBool()); + } + } + + @Test + public void testEqInt() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) { + final IValue input = IValue.long64(value); + assertTrue(input.isLong()); + assertTrue(value == input.getLong()); + final IValue output = module.runMethod("eqInt", input); + assertTrue(output.isLong()); + assertTrue(value == output.getLong()); + } + } + + @Test + public void testEqFloat() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + double[] values = + new double[] { + -Double.MAX_VALUE, + Double.MAX_VALUE, + -Double.MIN_VALUE, + Double.MIN_VALUE, + -Math.exp(1.d), + -Math.sqrt(2.d), + -3.1415f, + 3.1415f, + -1, + 0, + 1, + }; + for (double value : values) { + final IValue input = IValue.double64(value); + assertTrue(input.isDouble()); + assertTrue(value == input.getDouble()); + final IValue output = module.runMethod("eqFloat", input); + assertTrue(output.isDouble()); + assertTrue(value == output.getDouble()); + } + } + + @Test + public void testEqTensor() throws IOException { + final long[] inputTensorShape = new long[] {1, 3, 224, 224}; + final long numElements = Tensor.numel(inputTensorShape); + final float[] inputTensorData = new float[(int) numElements]; + for (int i = 0; i < numElements; ++i) { + inputTensorData[i] = i; + } + final Tensor inputTensor = Tensor.newFloat32Tensor(inputTensorShape, inputTensorData); + + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final IValue input = IValue.tensor(inputTensor); + assertTrue(input.isTensor()); + assertTrue(inputTensor == input.getTensor()); + final IValue output = module.runMethod("eqTensor", input); + assertTrue(output.isTensor()); + final Tensor outputTensor = output.getTensor(); + assertNotNull(outputTensor); + assertArrayEquals(inputTensorShape, outputTensor.shape); + float[] outputData = outputTensor.getDataAsFloatArray(); + for (int i = 0; i < numElements; i++) { + assertTrue(inputTensorData[i] == outputData[i]); + } + } + + @Test + public void testEqDictIntKeyIntValue() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Map inputMap = new HashMap<>(); + + inputMap.put(Long.MIN_VALUE, IValue.long64(-Long.MIN_VALUE)); + inputMap.put(Long.MAX_VALUE, IValue.long64(-Long.MAX_VALUE)); + inputMap.put(0l, IValue.long64(0l)); + inputMap.put(1l, IValue.long64(-1l)); + inputMap.put(-1l, IValue.long64(1l)); + + final IValue input = IValue.dictLongKey(inputMap); + assertTrue(input.isDictLongKey()); + + final IValue output = module.runMethod("eqDictIntKeyIntValue", input); + assertTrue(output.isDictLongKey()); + + final Map outputMap = output.getDictLongKey(); + assertTrue(inputMap.size() == outputMap.size()); + for (Map.Entry entry : inputMap.entrySet()) { + assertTrue(outputMap.get(entry.getKey()).getLong() == entry.getValue().getLong()); + } + } + + @Test + public void testEqDictStrKeyIntValue() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + final Map inputMap = new HashMap<>(); + + inputMap.put("long_min_value", IValue.long64(Long.MIN_VALUE)); + inputMap.put("long_max_value", IValue.long64(Long.MAX_VALUE)); + inputMap.put("long_0", IValue.long64(0l)); + inputMap.put("long_1", 
IValue.long64(1l)); + inputMap.put("long_-1", IValue.long64(-1l)); + + final IValue input = IValue.dictStringKey(inputMap); + assertTrue(input.isDictStringKey()); + + final IValue output = module.runMethod("eqDictStrKeyIntValue", input); + assertTrue(output.isDictStringKey()); + + final Map outputMap = output.getDictStringKey(); + assertTrue(inputMap.size() == outputMap.size()); + for (Map.Entry entry : inputMap.entrySet()) { + assertTrue(outputMap.get(entry.getKey()).getLong() == entry.getValue().getLong()); + } + } + + @Test + public void testListIntSumReturnTuple() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + + for (int n : new int[] {0, 1, 128}) { + long[] a = new long[n]; + long sum = 0; + for (int i = 0; i < n; i++) { + a[i] = i; + sum += a[i]; + } + final IValue input = IValue.longList(a); + assertTrue(input.isLongList()); + + final IValue output = module.runMethod("listIntSumReturnTuple", input); + + assertTrue(output.isTuple()); + assertTrue(2 == output.getTuple().length); + + IValue output0 = output.getTuple()[0]; + IValue output1 = output.getTuple()[1]; + + assertArrayEquals(a, output0.getLongList()); + assertTrue(sum == output1.getLong()); + } + } + + @Test + public void testOptionalIntIsNone() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + + assertFalse(module.runMethod("optionalIntIsNone", IValue.long64(1l)).getBool()); + assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).getBool()); + } + + @Test + public void testIntEq0None() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + + assertTrue(module.runMethod("intEq0None", IValue.long64(0l)).isNull()); + assertTrue(module.runMethod("intEq0None", IValue.long64(1l)).getLong() == 1l); + } + + @Test(expected = IllegalArgumentException.class) + public void testRunUndefinedMethod() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + module.runMethod("test_undefined_method_throws_exception"); + } + + @Test + public void testTensorMethods() { + long[] shape = new long[] {1, 3, 224, 224}; + final int numel = (int) Tensor.numel(shape); + int[] ints = new int[numel]; + float[] floats = new float[numel]; + + byte[] bytes = new byte[numel]; + for (int i = 0; i < numel; i++) { + bytes[i] = (byte) ((i % 255) - 128); + ints[i] = i; + floats[i] = i / 1000.f; + } + + Tensor tensorBytes = Tensor.newInt8Tensor(shape, bytes); + assertTrue(tensorBytes.dtype() == Tensor.DTYPE_INT8); + assertArrayEquals(bytes, tensorBytes.getDataAsByteArray()); + + Tensor tensorInts = Tensor.newInt32Tensor(shape, ints); + assertTrue(tensorInts.dtype() == Tensor.DTYPE_INT32); + assertArrayEquals(ints, tensorInts.getDataAsIntArray()); + + Tensor tensorFloats = Tensor.newFloat32Tensor(shape, floats); + assertTrue(tensorFloats.dtype() == Tensor.DTYPE_FLOAT32); + float[] floatsOut = tensorFloats.getDataAsFloatArray(); + assertTrue(floatsOut.length == numel); + for (int i = 0; i < numel; i++) { + assertTrue(floats[i] == floatsOut[i]); + } + } + + @Test(expected = IllegalStateException.class) + public void testTensorIllegalStateOnWrongType() { + long[] shape = new long[] {1, 3, 224, 224}; + final int numel = (int) Tensor.numel(shape); + float[] floats = new float[numel]; + Tensor tensorFloats = Tensor.newFloat32Tensor(shape, floats); + assertTrue(tensorFloats.dtype() == Tensor.DTYPE_FLOAT32); + tensorFloats.getDataAsByteArray(); + } + + + @Test + public 
void testEqString() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + String[] values = + new String[] { + "smoketest", + "проверка не латинских символов", // not latin symbols check + "#@$!@#)($*!@#$)(!@*#$" + }; + for (String value : values) { + final IValue input = IValue.string(value); + assertTrue(input.isString()); + assertTrue(value.equals(input.getString())); + final IValue output = module.runMethod("eqStr", input); + assertTrue(output.isString()); + assertTrue(value.equals(output.getString())); + } + } + + @Test + public void testStr3Concat() throws IOException { + final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); + String[] values = + new String[] { + "smoketest", + "проверка не латинских символов", // not latin symbols check + "#@$!@#)($*!@#$)(!@*#$" + }; + for (String value : values) { + final IValue input = IValue.string(value); + assertTrue(input.isString()); + assertTrue(value.equals(input.getString())); + final IValue output = module.runMethod("str3Concat", input); + assertTrue(output.isString()); + String expectedOutput = new StringBuilder().append(value).append(value).append(value).toString(); + assertTrue(expectedOutput.equals(output.getString())); + } + } + + protected abstract String assetFilePath(String assetName) throws IOException; +} From 1ffa81d772e2e85648537fe6db4e247b264811a0 Mon Sep 17 00:00:00 2001 From: David Reiss Date: Mon, 7 Oct 2019 16:58:27 -0700 Subject: [PATCH 099/113] Various cleanups to pytorch_android API (#27454) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27454 See detailed discussion at https://github.com/pytorch/pytorch/issues/27350 Test Plan: Imported from OSS Reviewed By: IvanKobzarev Differential Revision: D17800480 Pulled By: dreiss fbshipit-source-id: bf174e8b16231b89be771de0fa54c41e864a3eb0 --- .../java/org/pytorch/PytorchTestBase.java | 110 ++++++++-------- .../src/main/cpp/pytorch_jni.cpp | 56 ++++---- .../src/main/java/org/pytorch/DType.java | 29 ++++ .../src/main/java/org/pytorch/IValue.java | 52 ++++---- .../src/main/java/org/pytorch/Tensor.java | 124 ++++++++---------- .../pytorch/torchvision/TensorImageUtils.java | 4 +- 6 files changed, 199 insertions(+), 176 deletions(-) create mode 100644 android/pytorch_android/src/main/java/org/pytorch/DType.java diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java index 544171a1aacd4..a1cca0adddda2 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -24,7 +24,7 @@ public void setUp() { public void testForwardNull() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); final IValue input = - IValue.tensor(Tensor.newInt8Tensor(new long[] {1}, Tensor.allocateByteBuffer(1))); + IValue.from(Tensor.fromBlob(Tensor.allocateByteBuffer(1), new long[] {1})); assertTrue(input.isTensor()); final IValue output = module.forward(input); assertTrue(output.isNull()); @@ -34,12 +34,12 @@ public void testForwardNull() throws IOException { public void testEqBool() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); for (boolean value : new boolean[] {false, true}) { - final IValue input = IValue.bool(value); + final IValue input = IValue.from(value); assertTrue(input.isBool()); - 
assertTrue(value == input.getBool()); + assertTrue(value == input.toBool()); final IValue output = module.runMethod("eqBool", input); assertTrue(output.isBool()); - assertTrue(value == output.getBool()); + assertTrue(value == output.toBool()); } } @@ -47,12 +47,12 @@ public void testEqBool() throws IOException { public void testEqInt() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); for (long value : new long[] {Long.MIN_VALUE, -1024, -1, 0, 1, 1024, Long.MAX_VALUE}) { - final IValue input = IValue.long64(value); + final IValue input = IValue.from(value); assertTrue(input.isLong()); - assertTrue(value == input.getLong()); + assertTrue(value == input.toLong()); final IValue output = module.runMethod("eqInt", input); assertTrue(output.isLong()); - assertTrue(value == output.getLong()); + assertTrue(value == output.toLong()); } } @@ -74,12 +74,12 @@ public void testEqFloat() throws IOException { 1, }; for (double value : values) { - final IValue input = IValue.double64(value); + final IValue input = IValue.from(value); assertTrue(input.isDouble()); - assertTrue(value == input.getDouble()); + assertTrue(value == input.toDouble()); final IValue output = module.runMethod("eqFloat", input); assertTrue(output.isDouble()); - assertTrue(value == output.getDouble()); + assertTrue(value == output.toDouble()); } } @@ -91,17 +91,17 @@ public void testEqTensor() throws IOException { for (int i = 0; i < numElements; ++i) { inputTensorData[i] = i; } - final Tensor inputTensor = Tensor.newFloat32Tensor(inputTensorShape, inputTensorData); + final Tensor inputTensor = Tensor.fromBlob(inputTensorData, inputTensorShape); final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - final IValue input = IValue.tensor(inputTensor); + final IValue input = IValue.from(inputTensor); assertTrue(input.isTensor()); - assertTrue(inputTensor == input.getTensor()); + assertTrue(inputTensor == input.toTensor()); final IValue output = module.runMethod("eqTensor", input); assertTrue(output.isTensor()); - final Tensor outputTensor = output.getTensor(); + final Tensor outputTensor = output.toTensor(); assertNotNull(outputTensor); - assertArrayEquals(inputTensorShape, outputTensor.shape); + assertArrayEquals(inputTensorShape, outputTensor.shape()); float[] outputData = outputTensor.getDataAsFloatArray(); for (int i = 0; i < numElements; i++) { assertTrue(inputTensorData[i] == outputData[i]); @@ -113,22 +113,22 @@ public void testEqDictIntKeyIntValue() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); final Map inputMap = new HashMap<>(); - inputMap.put(Long.MIN_VALUE, IValue.long64(-Long.MIN_VALUE)); - inputMap.put(Long.MAX_VALUE, IValue.long64(-Long.MAX_VALUE)); - inputMap.put(0l, IValue.long64(0l)); - inputMap.put(1l, IValue.long64(-1l)); - inputMap.put(-1l, IValue.long64(1l)); + inputMap.put(Long.MIN_VALUE, IValue.from(-Long.MIN_VALUE)); + inputMap.put(Long.MAX_VALUE, IValue.from(-Long.MAX_VALUE)); + inputMap.put(0l, IValue.from(0l)); + inputMap.put(1l, IValue.from(-1l)); + inputMap.put(-1l, IValue.from(1l)); - final IValue input = IValue.dictLongKey(inputMap); + final IValue input = IValue.dictLongKeyFrom(inputMap); assertTrue(input.isDictLongKey()); final IValue output = module.runMethod("eqDictIntKeyIntValue", input); assertTrue(output.isDictLongKey()); - final Map outputMap = output.getDictLongKey(); + final Map outputMap = output.toDictLongKey(); assertTrue(inputMap.size() == outputMap.size()); for (Map.Entry entry 
: inputMap.entrySet()) { - assertTrue(outputMap.get(entry.getKey()).getLong() == entry.getValue().getLong()); + assertTrue(outputMap.get(entry.getKey()).toLong() == entry.getValue().toLong()); } } @@ -137,22 +137,22 @@ public void testEqDictStrKeyIntValue() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); final Map inputMap = new HashMap<>(); - inputMap.put("long_min_value", IValue.long64(Long.MIN_VALUE)); - inputMap.put("long_max_value", IValue.long64(Long.MAX_VALUE)); - inputMap.put("long_0", IValue.long64(0l)); - inputMap.put("long_1", IValue.long64(1l)); - inputMap.put("long_-1", IValue.long64(-1l)); + inputMap.put("long_min_value", IValue.from(Long.MIN_VALUE)); + inputMap.put("long_max_value", IValue.from(Long.MAX_VALUE)); + inputMap.put("long_0", IValue.from(0l)); + inputMap.put("long_1", IValue.from(1l)); + inputMap.put("long_-1", IValue.from(-1l)); - final IValue input = IValue.dictStringKey(inputMap); + final IValue input = IValue.dictStringKeyFrom(inputMap); assertTrue(input.isDictStringKey()); final IValue output = module.runMethod("eqDictStrKeyIntValue", input); assertTrue(output.isDictStringKey()); - final Map outputMap = output.getDictStringKey(); + final Map outputMap = output.toDictStringKey(); assertTrue(inputMap.size() == outputMap.size()); for (Map.Entry entry : inputMap.entrySet()) { - assertTrue(outputMap.get(entry.getKey()).getLong() == entry.getValue().getLong()); + assertTrue(outputMap.get(entry.getKey()).toLong() == entry.getValue().toLong()); } } @@ -167,19 +167,19 @@ public void testListIntSumReturnTuple() throws IOException { a[i] = i; sum += a[i]; } - final IValue input = IValue.longList(a); + final IValue input = IValue.listFrom(a); assertTrue(input.isLongList()); final IValue output = module.runMethod("listIntSumReturnTuple", input); assertTrue(output.isTuple()); - assertTrue(2 == output.getTuple().length); + assertTrue(2 == output.toTuple().length); - IValue output0 = output.getTuple()[0]; - IValue output1 = output.getTuple()[1]; + IValue output0 = output.toTuple()[0]; + IValue output1 = output.toTuple()[1]; - assertArrayEquals(a, output0.getLongList()); - assertTrue(sum == output1.getLong()); + assertArrayEquals(a, output0.toLongList()); + assertTrue(sum == output1.toLong()); } } @@ -187,16 +187,16 @@ public void testListIntSumReturnTuple() throws IOException { public void testOptionalIntIsNone() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - assertFalse(module.runMethod("optionalIntIsNone", IValue.long64(1l)).getBool()); - assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).getBool()); + assertFalse(module.runMethod("optionalIntIsNone", IValue.from(1l)).toBool()); + assertTrue(module.runMethod("optionalIntIsNone", IValue.optionalNull()).toBool()); } @Test public void testIntEq0None() throws IOException { final Module module = Module.load(assetFilePath(TEST_MODULE_ASSET_NAME)); - assertTrue(module.runMethod("intEq0None", IValue.long64(0l)).isNull()); - assertTrue(module.runMethod("intEq0None", IValue.long64(1l)).getLong() == 1l); + assertTrue(module.runMethod("intEq0None", IValue.from(0l)).isNull()); + assertTrue(module.runMethod("intEq0None", IValue.from(1l)).toLong() == 1l); } @Test(expected = IllegalArgumentException.class) @@ -219,16 +219,16 @@ public void testTensorMethods() { floats[i] = i / 1000.f; } - Tensor tensorBytes = Tensor.newInt8Tensor(shape, bytes); - assertTrue(tensorBytes.dtype() == Tensor.DTYPE_INT8); + Tensor tensorBytes = 
Tensor.fromBlob(bytes, shape); + assertTrue(tensorBytes.dtype() == DType.INT8); assertArrayEquals(bytes, tensorBytes.getDataAsByteArray()); - Tensor tensorInts = Tensor.newInt32Tensor(shape, ints); - assertTrue(tensorInts.dtype() == Tensor.DTYPE_INT32); + Tensor tensorInts = Tensor.fromBlob(ints, shape); + assertTrue(tensorInts.dtype() == DType.INT32); assertArrayEquals(ints, tensorInts.getDataAsIntArray()); - Tensor tensorFloats = Tensor.newFloat32Tensor(shape, floats); - assertTrue(tensorFloats.dtype() == Tensor.DTYPE_FLOAT32); + Tensor tensorFloats = Tensor.fromBlob(floats, shape); + assertTrue(tensorFloats.dtype() == DType.FLOAT32); float[] floatsOut = tensorFloats.getDataAsFloatArray(); assertTrue(floatsOut.length == numel); for (int i = 0; i < numel; i++) { @@ -241,8 +241,8 @@ public void testTensorIllegalStateOnWrongType() { long[] shape = new long[] {1, 3, 224, 224}; final int numel = (int) Tensor.numel(shape); float[] floats = new float[numel]; - Tensor tensorFloats = Tensor.newFloat32Tensor(shape, floats); - assertTrue(tensorFloats.dtype() == Tensor.DTYPE_FLOAT32); + Tensor tensorFloats = Tensor.fromBlob(floats, shape); + assertTrue(tensorFloats.dtype() == DType.FLOAT32); tensorFloats.getDataAsByteArray(); } @@ -257,12 +257,12 @@ public void testEqString() throws IOException { "#@$!@#)($*!@#$)(!@*#$" }; for (String value : values) { - final IValue input = IValue.string(value); + final IValue input = IValue.from(value); assertTrue(input.isString()); - assertTrue(value.equals(input.getString())); + assertTrue(value.equals(input.toStr())); final IValue output = module.runMethod("eqStr", input); assertTrue(output.isString()); - assertTrue(value.equals(output.getString())); + assertTrue(value.equals(output.toStr())); } } @@ -276,13 +276,13 @@ public void testStr3Concat() throws IOException { "#@$!@#)($*!@#$)(!@*#$" }; for (String value : values) { - final IValue input = IValue.string(value); + final IValue input = IValue.from(value); assertTrue(input.isString()); - assertTrue(value.equals(input.getString())); + assertTrue(value.equals(input.toStr())); final IValue output = module.runMethod("str3Concat", input); assertTrue(output.isString()); String expectedOutput = new StringBuilder().append(value).append(value).append(value).toString(); - assertTrue(expectedOutput.equals(output.getString())); + assertTrue(expectedOutput.equals(output.toStr())); } } diff --git a/android/pytorch_android/src/main/cpp/pytorch_jni.cpp b/android/pytorch_android/src/main/cpp/pytorch_jni.cpp index bb00e47087422..7f09b51a5151b 100644 --- a/android/pytorch_android/src/main/cpp/pytorch_jni.cpp +++ b/android/pytorch_android/src/main/cpp/pytorch_jni.cpp @@ -10,6 +10,8 @@ namespace pytorch_jni { +// NOTE: Codes must be kept in sync with DType.java. +// NOTE: Never serialize these, because they can change between releases. 
constexpr static int kTensorDTypeUInt8 = 1; constexpr static int kTensorDTypeInt8 = 2; constexpr static int kTensorDTypeInt32 = 3; @@ -164,7 +166,7 @@ class JTensor : public facebook::jni::JavaClass { static at::Tensor newAtTensorFromJTensor( facebook::jni::alias_ref jtensor) { static const auto dtypeMethod = - JTensor::javaClassStatic()->getMethod("dtype"); + JTensor::javaClassStatic()->getMethod("dtypeJniCode"); jint jdtype = dtypeMethod(jtensor); static const auto shapeField = @@ -216,7 +218,7 @@ class JIValue : public facebook::jni::JavaClass { static auto jMethodTensor = JIValue::javaClassStatic() ->getStaticMethod( - facebook::jni::local_ref)>("tensor"); + facebook::jni::local_ref)>("from"); return jMethodTensor( JIValue::javaClassStatic(), JTensor::newJTensorFromAtTensor(ivalue.toTensor())); @@ -224,26 +226,26 @@ class JIValue : public facebook::jni::JavaClass { static auto jMethodBool = JIValue::javaClassStatic() ->getStaticMethod(jboolean)>( - "bool"); + "from"); return jMethodBool(JIValue::javaClassStatic(), ivalue.toBool()); } else if (ivalue.isInt()) { static auto jMethodInt = JIValue::javaClassStatic() ->getStaticMethod(jlong)>( - "long64"); + "from"); return jMethodInt(JIValue::javaClassStatic(), ivalue.toInt()); } else if (ivalue.isDouble()) { static auto jMethodDouble = JIValue::javaClassStatic() ->getStaticMethod(jdouble)>( - "double64"); + "from"); return jMethodDouble(JIValue::javaClassStatic(), ivalue.toDouble()); } else if (ivalue.isString()) { static auto jMethodString = JIValue::javaClassStatic() ->getStaticMethod( facebook::jni::alias_ref< - facebook::jni::JString::javaobject>)>("string"); + facebook::jni::JString::javaobject>)>("from"); return jMethodString( JIValue::javaClassStatic(), facebook::jni::make_jstring(ivalue.toStringRef())); @@ -253,7 +255,7 @@ class JIValue : public facebook::jni::JavaClass { JIValue::javaClassStatic() ->getStaticMethod( facebook::jni::alias_ref::javaobject>)>("tuple"); + JIValue::javaobject>::javaobject>)>("tupleFrom"); auto jElementsArray = facebook::jni::JArrayClass::newArray( elementsVec.size()); @@ -267,7 +269,7 @@ class JIValue : public facebook::jni::JavaClass { static auto jMethodBoolListArr = JIValue::javaClassStatic() ->getStaticMethod( - facebook::jni::alias_ref)>("boolList"); + facebook::jni::alias_ref)>("listFrom"); size_t n = list.size(); auto jArray = facebook::jni::make_boolean_array(n); auto jArrayPinned = jArray->pin(); @@ -281,7 +283,7 @@ class JIValue : public facebook::jni::JavaClass { static auto jMethodLongListArr = JIValue::javaClassStatic() ->getStaticMethod( - facebook::jni::alias_ref)>("longList"); + facebook::jni::alias_ref)>("listFrom"); size_t n = list.size(); auto jArray = facebook::jni::make_long_array(n); auto jArrayPinned = jArray->pin(); @@ -295,7 +297,7 @@ class JIValue : public facebook::jni::JavaClass { static auto jMethoDoubleListArr = JIValue::javaClassStatic() ->getStaticMethod( - facebook::jni::alias_ref)>("doubleList"); + facebook::jni::alias_ref)>("listFrom"); size_t n = list.size(); auto jArray = facebook::jni::make_double_array(n); auto jArrayPinned = jArray->pin(); @@ -310,7 +312,7 @@ class JIValue : public facebook::jni::JavaClass { JIValue::javaClassStatic() ->getStaticMethod( facebook::jni::alias_ref::javaobject>)>("tensorList"); + JTensor::javaobject>::javaobject>)>("listFrom"); auto jArray = facebook::jni::JArrayClass::newArray( list.size()); auto index = 0; @@ -324,7 +326,7 @@ class JIValue : public facebook::jni::JavaClass { JIValue::javaClassStatic() ->getStaticMethod( 
facebook::jni::alias_ref::javaobject>)>("list"); + JIValue::javaobject>::javaobject>)>("listFrom"); auto jArray = facebook::jni::JArrayClass::newArray( list.size()); auto index = 0; @@ -351,7 +353,7 @@ class JIValue : public facebook::jni::JavaClass { facebook::jni::alias_ref< facebook::jni::JString::javaobject>, facebook::jni::alias_ref>>)>( - "dictStringKey"); + "dictStringKeyFrom"); auto jmap = JHashMap< facebook::jni::alias_ref, @@ -370,7 +372,7 @@ class JIValue : public facebook::jni::JavaClass { facebook::jni::alias_ref< facebook::jni::JLong::javaobject>, facebook::jni::alias_ref>>)>( - "dictLongKey"); + "dictLongKeyFrom"); auto jmap = JHashMap< facebook::jni::alias_ref, facebook::jni::alias_ref>::create(); @@ -404,32 +406,32 @@ class JIValue : public facebook::jni::JavaClass { static const auto jMethodGetTensor = JIValue::javaClassStatic() ->getMethod()>( - "getTensor"); + "toTensor"); return JTensor::newAtTensorFromJTensor(jMethodGetTensor(jivalue)); } else if (JIValue::kTypeCodeBool == typeCode) { static const auto jMethodGetBool = - JIValue::javaClassStatic()->getMethod("getBool"); + JIValue::javaClassStatic()->getMethod("toBool"); // explicit cast to bool as jboolean is defined as uint8_t, IValue ctor // for int will be called for jboolean bool b = jMethodGetBool(jivalue); return at::IValue{b}; } else if (JIValue::kTypeCodeLong == typeCode) { static const auto jMethodGetLong = - JIValue::javaClassStatic()->getMethod("getLong"); + JIValue::javaClassStatic()->getMethod("toLong"); return at::IValue{jMethodGetLong(jivalue)}; } else if (JIValue::kTypeCodeDouble == typeCode) { static const auto jMethodGetDouble = - JIValue::javaClassStatic()->getMethod("getDouble"); + JIValue::javaClassStatic()->getMethod("toDouble"); return at::IValue{jMethodGetDouble(jivalue)}; } else if (JIValue::kTypeCodeString == typeCode) { static const auto jMethodGetString = - JIValue::javaClassStatic()->getMethod("getString"); + JIValue::javaClassStatic()->getMethod("toStr"); return at::IValue{jMethodGetString(jivalue)->toStdString()}; } else if (JIValue::kTypeCodeTuple == typeCode) { static const auto jMethodGetTuple = JIValue::javaClassStatic() ->getMethod::javaobject()>("getTuple"); + JIValue::javaobject>::javaobject()>("toTuple"); auto jarray = jMethodGetTuple(jivalue); size_t n = jarray->size(); @@ -443,7 +445,7 @@ class JIValue : public facebook::jni::JavaClass { return c10::ivalue::Tuple::create(std::move(elements)); } else if (JIValue::kTypeCodeBoolList == typeCode) { static const auto jMethodGetBoolList = - JIValue::javaClassStatic()->getMethod("getBoolList"); + JIValue::javaClassStatic()->getMethod("toBoolList"); auto jArray = jMethodGetBoolList(jivalue); auto jArrayPinned = jArray->pin(); size_t n = jArrayPinned.size(); @@ -455,7 +457,7 @@ class JIValue : public facebook::jni::JavaClass { return at::IValue{std::move(list)}; } else if (JIValue::kTypeCodeLongList == typeCode) { static const auto jMethodGetLongList = - JIValue::javaClassStatic()->getMethod("getLongList"); + JIValue::javaClassStatic()->getMethod("toLongList"); auto jArray = jMethodGetLongList(jivalue); auto jArrayPinned = jArray->pin(); size_t n = jArrayPinned.size(); @@ -468,7 +470,7 @@ class JIValue : public facebook::jni::JavaClass { } else if (JIValue::kTypeCodeDoubleList == typeCode) { static const auto jMethodGetDoubleList = JIValue::javaClassStatic()->getMethod( - "getDoubleList"); + "toDoubleList"); auto jArray = jMethodGetDoubleList(jivalue); auto jArrayPinned = jArray->pin(); size_t n = jArrayPinned.size(); @@ -482,7 +484,7 
@@ class JIValue : public facebook::jni::JavaClass { static const auto jMethodGetTensorList = JIValue::javaClassStatic() ->getMethod::javaobject()>("getTensorList"); + JTensor::javaobject>::javaobject()>("toTensorList"); auto jArray = jMethodGetTensorList(jivalue); size_t n = jArray->size(); c10::List list{}; @@ -495,7 +497,7 @@ class JIValue : public facebook::jni::JavaClass { static const auto jMethodGetList = JIValue::javaClassStatic() ->getMethod::javaobject()>("getList"); + JIValue::javaobject>::javaobject()>("toList"); auto jarray = jMethodGetList(jivalue); size_t n = jarray->size(); if (n == 0) { @@ -518,7 +520,7 @@ class JIValue : public facebook::jni::JavaClass { static const auto jMethodGetDictStringKey = JIValue::javaClassStatic() ->getMethod:: - javaobject()>("getDictStringKey"); + javaobject()>("toDictStringKey"); auto jmap = jMethodGetDictStringKey(jivalue); auto it = jmap->begin(); if (it == jmap->end()) { @@ -541,7 +543,7 @@ class JIValue : public facebook::jni::JavaClass { JIValue::javaClassStatic() ->getMethod::javaobject()>("getDictLongKey"); + JIValue::javaobject>::javaobject()>("toDictLongKey"); auto jmap = jMethodGetDictLongKey(jivalue); auto it = jmap->begin(); if (it == jmap->end()) { diff --git a/android/pytorch_android/src/main/java/org/pytorch/DType.java b/android/pytorch_android/src/main/java/org/pytorch/DType.java new file mode 100644 index 0000000000000..2278b16e0bb96 --- /dev/null +++ b/android/pytorch_android/src/main/java/org/pytorch/DType.java @@ -0,0 +1,29 @@ +package org.pytorch; + +/** + * Codes representing tensor data types. + */ +public enum DType { + // NOTE: "jniCode" must be kept in sync with pytorch_jni.cpp. + // NOTE: Never serialize "jniCode", because it can change between releases. + + /** Code for dtype torch.uint8. {@link Tensor#dtype()} */ + UINT8(1), + /** Code for dtype torch.int8. {@link Tensor#dtype()} */ + INT8(2), + /** Code for dtype torch.int32. {@link Tensor#dtype()} */ + INT32(3), + /** Code for dtype torch.float32. {@link Tensor#dtype()} */ + FLOAT32(4), + /** Code for dtype torch.int64. {@link Tensor#dtype()} */ + INT64(5), + /** Code for dtype torch.float64. {@link Tensor#dtype()} */ + FLOAT64(6), + ; + + final int jniCode; + + DType(int jniCode) { + this.jniCode = jniCode; + } +} diff --git a/android/pytorch_android/src/main/java/org/pytorch/IValue.java b/android/pytorch_android/src/main/java/org/pytorch/IValue.java index 5868721be543a..817fa56f2b358 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/IValue.java +++ b/android/pytorch_android/src/main/java/org/pytorch/IValue.java @@ -98,7 +98,7 @@ public static IValue optionalNull() { /** * Creates a new IValue instance of torchscript Tensor type. */ - public static IValue tensor(Tensor tensor) { + public static IValue from(Tensor tensor) { final IValue iv = new IValue(TYPE_CODE_TENSOR); iv.mData = tensor; return iv; @@ -107,7 +107,7 @@ public static IValue tensor(Tensor tensor) { /** * Creates a new IValue instance of torchscript bool type. */ - public static IValue bool(boolean value) { + public static IValue from(boolean value) { final IValue iv = new IValue(TYPE_CODE_BOOL); iv.mData = value; return iv; @@ -116,7 +116,7 @@ public static IValue bool(boolean value) { /** * Creates a new IValue instance of torchscript int type. 
*/ - public static IValue long64(long value) { + public static IValue from(long value) { final IValue iv = new IValue(TYPE_CODE_LONG); iv.mData = value; return iv; @@ -125,7 +125,7 @@ public static IValue long64(long value) { /** * Creates a new IValue instance of torchscript float type. */ - public static IValue double64(double value) { + public static IValue from(double value) { final IValue iv = new IValue(TYPE_CODE_DOUBLE); iv.mData = value; return iv; @@ -134,7 +134,7 @@ public static IValue double64(double value) { /** * Creates new IValue instance of torchscript str type. */ - public static IValue string(String value) { + public static IValue from(String value) { final IValue iv = new IValue(TYPE_CODE_STRING); iv.mData = value; return iv; @@ -143,7 +143,7 @@ public static IValue string(String value) { /** * Creates a new IValue instance of torchscript List[bool] type. */ - public static IValue boolList(boolean... list) { + public static IValue listFrom(boolean... list) { final IValue iv = new IValue(TYPE_CODE_BOOL_LIST); iv.mData = list; return iv; @@ -152,7 +152,7 @@ public static IValue boolList(boolean... list) { /** * Creates a new IValue instance of torchscript List[int] type. */ - public static IValue longList(long... list) { + public static IValue listFrom(long... list) { final IValue iv = new IValue(TYPE_CODE_LONG_LIST); iv.mData = list; return iv; @@ -161,7 +161,7 @@ public static IValue longList(long... list) { /** * Creates a new IValue instance of torchscript List[float] type. */ - public static IValue doubleList(double... list) { + public static IValue listFrom(double... list) { final IValue iv = new IValue(TYPE_CODE_DOUBLE_LIST); iv.mData = list; return iv; @@ -170,7 +170,7 @@ public static IValue doubleList(double... list) { /** * Creates a new IValue instance of torchscript List[Tensor] type. */ - public static IValue tensorList(Tensor... list) { + public static IValue listFrom(Tensor... list) { final IValue iv = new IValue(TYPE_CODE_TENSOR_LIST); iv.mData = list; return iv; @@ -179,7 +179,7 @@ public static IValue tensorList(Tensor... list) { /** * Creates a new IValue instance of torchscript List[T] type. All elements must have the same type. */ - public static IValue list(IValue... array) { + public static IValue listFrom(IValue... array) { final int size = array.length; if (size > 0) { final int typeCode0 = array[0].mTypeCode; @@ -198,7 +198,7 @@ public static IValue list(IValue... array) { /** * Creates a new IValue instance of torchscript Tuple[T0, T1, ...] type. */ - public static IValue tuple(IValue... array) { + public static IValue tupleFrom(IValue... array) { final IValue iv = new IValue(TYPE_CODE_TUPLE); iv.mData = array; return iv; @@ -207,7 +207,7 @@ public static IValue tuple(IValue... array) { /** * Creates a new IValue instance oftorchscript Dict[Str, V] type. */ - public static IValue dictStringKey(Map map) { + public static IValue dictStringKeyFrom(Map map) { final IValue iv = new IValue(TYPE_CODE_DICT_STRING_KEY); iv.mData = map; return iv; @@ -216,73 +216,73 @@ public static IValue dictStringKey(Map map) { /** * Creates a new IValue instance of torchscript Dict[int, V] type. 
*/ - public static IValue dictLongKey(Map map) { + public static IValue dictLongKeyFrom(Map map) { final IValue iv = new IValue(TYPE_CODE_DICT_LONG_KEY); iv.mData = map; return iv; } - public Tensor getTensor() { + public Tensor toTensor() { preconditionType(TYPE_CODE_TENSOR, mTypeCode); return (Tensor) mData; } - public boolean getBool() { + public boolean toBool() { preconditionType(TYPE_CODE_BOOL, mTypeCode); return (boolean) mData; } - public long getLong() { + public long toLong() { preconditionType(TYPE_CODE_LONG, mTypeCode); return (long) mData; } - public double getDouble() { + public double toDouble() { preconditionType(TYPE_CODE_DOUBLE, mTypeCode); return (double) mData; } - public String getString() { + public String toStr() { preconditionType(TYPE_CODE_STRING, mTypeCode); return (String) mData; } - public boolean[] getBoolList() { + public boolean[] toBoolList() { preconditionType(TYPE_CODE_BOOL_LIST, mTypeCode); return (boolean[]) mData; } - public long[] getLongList() { + public long[] toLongList() { preconditionType(TYPE_CODE_LONG_LIST, mTypeCode); return (long[]) mData; } - public double[] getDoubleList() { + public double[] toDoubleList() { preconditionType(TYPE_CODE_DOUBLE_LIST, mTypeCode); return (double[]) mData; } - public Tensor[] getTensorList() { + public Tensor[] toTensorList() { preconditionType(TYPE_CODE_TENSOR_LIST, mTypeCode); return (Tensor[]) mData; } - public IValue[] getList() { + public IValue[] toList() { preconditionType(TYPE_CODE_LIST, mTypeCode); return (IValue[]) mData; } - public IValue[] getTuple() { + public IValue[] toTuple() { preconditionType(TYPE_CODE_TUPLE, mTypeCode); return (IValue[]) mData; } - public Map getDictStringKey() { + public Map toDictStringKey() { preconditionType(TYPE_CODE_DICT_STRING_KEY, mTypeCode); return (Map) mData; } - public Map getDictLongKey() { + public Map toDictLongKey() { preconditionType(TYPE_CODE_DICT_LONG_KEY, mTypeCode); return (Map) mData; } diff --git a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java index 4178a9adfb098..13ffd4314502d 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java @@ -15,20 +15,6 @@ * {@link java.nio.DirectByteBuffer} of one of the supported types. */ public abstract class Tensor { - - /** Code for dtype torch.uint8. {@link Tensor#dtype()} */ - public static final int DTYPE_UINT8 = 1; - /** Code for dtype torch.int8. {@link Tensor#dtype()} */ - public static final int DTYPE_INT8 = 2; - /** Code for dtype torch.int32. {@link Tensor#dtype()} */ - public static final int DTYPE_INT32 = 3; - /** Code for dtype torch.float32. {@link Tensor#dtype()} */ - public static final int DTYPE_FLOAT32 = 4; - /** Code for dtype torch.int64. {@link Tensor#dtype()} */ - public static final int DTYPE_INT64 = 5; - /** Code for dtype torch.float64. {@link Tensor#dtype()} */ - public static final int DTYPE_FLOAT64 = 6; - private static final String ERROR_MSG_DATA_BUFFER_NOT_NULL = "Data buffer must be not null"; private static final String ERROR_MSG_DATA_ARRAY_NOT_NULL = "Data array must be not null"; private static final String ERROR_MSG_SHAPE_NOT_NULL = "Shape must be not null"; @@ -39,8 +25,7 @@ public abstract class Tensor { private static final String ERROR_MSG_DATA_BUFFER_MUST_BE_DIRECT = "Data buffer must be direct (java.nio.ByteBuffer#allocateDirect)"; - /** Shape of current tensor. 
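For readers following the renames in the hunks above, here is a small illustrative sketch of how the Java API reads after this change. It is not part of the patch; the class name is invented, and it assumes the pytorch_android library and its native dependencies can be loaded in the running environment.

import java.util.HashMap;
import java.util.Map;
import org.pytorch.IValue;
import org.pytorch.Tensor;

public class IValueRenameSketch {
  public static void main(String[] args) {
    // Scalar factories are now overloads of IValue.from(...) instead of
    // per-type names such as IValue.tensor(...) or IValue.long64(...).
    IValue i = IValue.from(42L);      // TorchScript int
    IValue d = IValue.from(3.14);     // TorchScript float
    IValue s = IValue.from("hello");  // TorchScript str
    Tensor t = Tensor.fromBlob(new float[] {1f, 2f}, new long[] {2});
    IValue tv = IValue.from(t);

    // Container factories gained a "From" suffix.
    IValue longs = IValue.listFrom(1L, 2L, 3L);
    IValue tuple = IValue.tupleFrom(i, d, s);
    Map<String, IValue> m = new HashMap<>();
    m.put("input", tv);
    IValue dict = IValue.dictStringKeyFrom(m);

    // Accessors were renamed from getX() to toX(); note toStr(), not toString().
    long n = i.toLong();
    String str = s.toStr();
    long[] ls = longs.toLongList();
    IValue[] items = tuple.toTuple();
    Tensor back = dict.toDictStringKey().get("input").toTensor();
    System.out.println(n + " " + str + " " + ls.length + " "
        + items.length + " " + back.numel());
  }
}

The "From" suffix keeps the container factories distinguishable from the scalar overloads, since varargs of boxed and primitive types would otherwise collide under a single overloaded name.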
*/ - public final long[] shape; + final long[] shape; private static final int INT_SIZE_BYTES = 4; private static final int FLOAT_SIZE_BYTES = 4; @@ -49,8 +34,8 @@ public abstract class Tensor { /** * Allocates a new direct {@link java.nio.ByteBuffer} with native byte order with specified - * capacity that can be used in {@link Tensor#newInt8Tensor(long[], ByteBuffer)}, {@link - * Tensor#newUInt8Tensor(long[], ByteBuffer)}. + * capacity that can be used in {@link Tensor#fromBlob(ByteBuffer, long[])}, + * {@link Tensor#fromBlobUnsigned(ByteBuffer, long[])}. * * @param numElements capacity (number of elements) of result buffer. */ @@ -66,7 +51,7 @@ public static IntBuffer allocateIntBuffer(int numElements) { /** * Allocates a new direct {@link java.nio.FloatBuffer} with native byte order with specified - * capacity that can be used in {@link Tensor#newFloat32Tensor(long[], FloatBuffer)}. + * capacity that can be used in {@link Tensor#fromBlob(FloatBuffer, long[])}. * * @param numElements capacity (number of elements) of result buffer. */ @@ -78,7 +63,7 @@ public static FloatBuffer allocateFloatBuffer(int numElements) { /** * Allocates a new direct {@link java.nio.LongBuffer} with native byte order with specified - * capacity that can be used in {@link Tensor#newInt64Tensor(long[], LongBuffer)}. + * capacity that can be used in {@link Tensor#fromBlob(LongBuffer, long[])}. * * @param numElements capacity (number of elements) of result buffer. */ @@ -90,7 +75,7 @@ public static LongBuffer allocateLongBuffer(int numElements) { /** * Allocates a new direct {@link java.nio.DoubleBuffer} with native byte order with specified - * capacity that can be used in {@link Tensor#newFloat64Tensor(long[], DoubleBuffer)}. + * capacity that can be used in {@link Tensor#fromBlob(DoubleBuffer, long[])}. * * @param numElements capacity (number of elements) of result buffer. */ @@ -104,10 +89,10 @@ public static DoubleBuffer allocateDoubleBuffer(int numElements) { * Creates a new Tensor instance with dtype torch.uint8 with specified shape and data as array of * bytes. * - * @param shape Tensor shape * @param data Tensor elements + * @param shape Tensor shape */ - public static Tensor newUInt8Tensor(long[] shape, byte[] data) { + public static Tensor fromBlobUnsigned(byte[] data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_ARRAY_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -121,10 +106,10 @@ public static Tensor newUInt8Tensor(long[] shape, byte[] data) { * Creates a new Tensor instance with dtype torch.int8 with specified shape and data as array of * bytes. * - * @param shape Tensor shape * @param data Tensor elements + * @param shape Tensor shape */ - public static Tensor newInt8Tensor(long[] shape, byte[] data) { + public static Tensor fromBlob(byte[] data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_ARRAY_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -138,10 +123,10 @@ public static Tensor newInt8Tensor(long[] shape, byte[] data) { * Creates a new Tensor instance with dtype torch.int32 with specified shape and data as array of * ints. 
* - * @param shape Tensor shape * @param data Tensor elements + * @param shape Tensor shape */ - public static Tensor newInt32Tensor(long[] shape, int[] data) { + public static Tensor fromBlob(int[] data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_ARRAY_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -155,10 +140,10 @@ public static Tensor newInt32Tensor(long[] shape, int[] data) { * Creates a new Tensor instance with dtype torch.float32 with specified shape and data as array * of floats. * - * @param shape Tensor shape * @param data Tensor elements + * @param shape Tensor shape */ - public static Tensor newFloat32Tensor(long[] shape, float[] data) { + public static Tensor fromBlob(float[] data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_ARRAY_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -172,10 +157,10 @@ public static Tensor newFloat32Tensor(long[] shape, float[] data) { * Creates a new Tensor instance with dtype torch.int64 with specified shape and data as array of * longs. * - * @param shape Tensor shape * @param data Tensor elements + * @param shape Tensor shape */ - public static Tensor newInt64Tensor(long[] shape, long[] data) { + public static Tensor fromBlob(long[] data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_ARRAY_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -192,7 +177,7 @@ public static Tensor newInt64Tensor(long[] shape, long[] data) { * @param shape Tensor shape * @param data Tensor elements */ - public static Tensor newFloat64Tensor(long[] shape, double[] data) { + public static Tensor fromBlob(long[] shape, double[] data) { checkArgument(data != null, ERROR_MSG_DATA_ARRAY_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -205,12 +190,12 @@ public static Tensor newFloat64Tensor(long[] shape, double[] data) { /** * Creates a new Tensor instance with dtype torch.uint8 with specified shape and data. * - * @param shape Tensor shape * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. + * @param shape Tensor shape */ - public static Tensor newUInt8Tensor(long[] shape, ByteBuffer data) { + public static Tensor fromBlobUnsigned(ByteBuffer data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_BUFFER_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -225,12 +210,12 @@ public static Tensor newUInt8Tensor(long[] shape, ByteBuffer data) { /** * Creates a new Tensor instance with dtype torch.int8 with specified shape and data. * - * @param shape Tensor shape * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. + * @param shape Tensor shape */ - public static Tensor newInt8Tensor(long[] shape, ByteBuffer data) { + public static Tensor fromBlob(ByteBuffer data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_BUFFER_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -245,12 +230,12 @@ public static Tensor newInt8Tensor(long[] shape, ByteBuffer data) { /** * Creates a new Tensor instance with dtype torch.int32 with specified shape and data. 
* - * @param shape Tensor shape * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. + * @param shape Tensor shape */ - public static Tensor newInt32Tensor(long[] shape, IntBuffer data) { + public static Tensor fromBlob(IntBuffer data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_BUFFER_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -265,12 +250,12 @@ public static Tensor newInt32Tensor(long[] shape, IntBuffer data) { /** * Creates a new Tensor instance with dtype torch.float32 with specified shape and data. * - * @param shape Tensor shape * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. + * @param shape Tensor shape */ - public static Tensor newFloat32Tensor(long[] shape, FloatBuffer data) { + public static Tensor fromBlob(FloatBuffer data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_BUFFER_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -285,12 +270,12 @@ public static Tensor newFloat32Tensor(long[] shape, FloatBuffer data) { /** * Creates a new Tensor instance with dtype torch.int64 with specified shape and data. * - * @param shape Tensor shape * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. + * @param shape Tensor shape */ - public static Tensor newInt64Tensor(long[] shape, LongBuffer data) { + public static Tensor fromBlob(LongBuffer data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_BUFFER_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -305,12 +290,12 @@ public static Tensor newInt64Tensor(long[] shape, LongBuffer data) { /** * Creates a new Tensor instance with dtype torch.float64 with specified shape and data. * - * @param shape Tensor shape * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. + * @param shape Tensor shape */ - public static Tensor newFloat64Tensor(long[] shape, DoubleBuffer data) { + public static Tensor fromBlob(DoubleBuffer data, long[] shape) { checkArgument(data != null, ERROR_MSG_DATA_BUFFER_NOT_NULL); checkArgument(shape != null, ERROR_MSG_SHAPE_NOT_NULL); checkShape(shape); @@ -342,12 +327,19 @@ public static long numel(long[] shape) { return result; } + /** Shape of current tensor. */ + public long[] shape() { + return Arrays.copyOf(shape, shape.length); + } + /** - * Returns dtype of current tensor. Can be one of {@link Tensor#DTYPE_UINT8}, {@link - * Tensor#DTYPE_INT8}, {@link Tensor#DTYPE_INT32},{@link Tensor#DTYPE_FLOAT32}, {@link - * Tensor#DTYPE_INT64}, {@link Tensor#DTYPE_FLOAT64}. + * Returns dtype of current tensor. */ - public abstract int dtype(); + public abstract DType dtype(); + + int dtypeJniCode() { + return dtype().jniCode; + } /** * Returns newly allocated java byte array that contains a copy of tensor data. 
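As an aside, a minimal sketch of the renamed Tensor factory API after this commit: the argument order is now data first, then shape, and dtype() returns the new DType enum. Illustrative only; the class name is invented and a working pytorch_android runtime is assumed.

import java.nio.FloatBuffer;
import java.util.Arrays;
import org.pytorch.DType;
import org.pytorch.Tensor;

public class TensorFactorySketch {
  public static void main(String[] args) {
    // Array-backed construction: fromBlob(data, shape) replaces newFloat32Tensor(shape, data).
    Tensor a = Tensor.fromBlob(new float[] {1f, 2f, 3f, 4f, 5f, 6f}, new long[] {2, 3});

    // Buffer-backed construction via the matching allocate* helper.
    FloatBuffer buf = Tensor.allocateFloatBuffer(6);
    buf.put(new float[] {1f, 2f, 3f, 4f, 5f, 6f});
    Tensor b = Tensor.fromBlob(buf, new long[] {2, 3});

    // dtype() now returns DType instead of an int code, and the public
    // shape field was replaced by a copying shape() accessor.
    System.out.println(a.dtype() == DType.FLOAT32);  // true
    System.out.println(Arrays.toString(a.shape()));  // [2, 3]
    System.out.println(b.numel());                   // 6
  }
}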
@@ -423,8 +415,8 @@ private Tensor_uint8(ByteBuffer data, long[] shape) { } @Override - public int dtype() { - return DTYPE_UINT8; + public DType dtype() { + return DType.UINT8; } @Override @@ -455,8 +447,8 @@ private Tensor_int8(ByteBuffer data, long[] shape) { } @Override - public int dtype() { - return DTYPE_INT8; + public DType dtype() { + return DType.INT8; } @Override @@ -487,8 +479,8 @@ private Tensor_int32(IntBuffer data, long[] shape) { } @Override - public int dtype() { - return DTYPE_INT32; + public DType dtype() { + return DType.INT32; } @Override @@ -527,8 +519,8 @@ public float[] getDataAsFloatArray() { } @Override - public int dtype() { - return DTYPE_FLOAT32; + public DType dtype() { + return DType.FLOAT32; } @Override @@ -551,8 +543,8 @@ private Tensor_int64(LongBuffer data, long[] shape) { } @Override - public int dtype() { - return DTYPE_INT64; + public DType dtype() { + return DType.INT64; } @Override @@ -583,8 +575,8 @@ private Tensor_float64(DoubleBuffer data, long[] shape) { } @Override - public int dtype() { - return DTYPE_FLOAT64; + public DType dtype() { + return DType.FLOAT64; } @Override @@ -634,17 +626,17 @@ private static void checkShapeAndDataCapacityConsistency(int dataCapacity, long[ // Called from native private static Tensor nativeNewTensor(ByteBuffer data, long[] shape, int dtype) { - if (DTYPE_FLOAT32 == dtype) { + if (DType.FLOAT32.jniCode == dtype) { return new Tensor_float32(data.asFloatBuffer(), shape); - } else if (DTYPE_INT32 == dtype) { + } else if (DType.INT32.jniCode == dtype) { return new Tensor_int32(data.asIntBuffer(), shape); - } else if (DTYPE_INT64 == dtype) { + } else if (DType.INT64.jniCode == dtype) { return new Tensor_int64(data.asLongBuffer(), shape); - } else if (DTYPE_FLOAT64 == dtype) { + } else if (DType.FLOAT64.jniCode == dtype) { return new Tensor_float64(data.asDoubleBuffer(), shape); - } else if (DTYPE_UINT8 == dtype) { + } else if (DType.UINT8.jniCode == dtype) { return new Tensor_uint8(data, shape); - } else if (DTYPE_INT8 == dtype) { + } else if (DType.INT8.jniCode == dtype) { return new Tensor_int8(data, shape); } throw new IllegalArgumentException("Unknown Tensor dtype"); diff --git a/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java b/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java index 4c4258b4a4784..4aa740c2c5ee5 100644 --- a/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java +++ b/android/pytorch_android_torchvision/src/main/java/org/pytorch/torchvision/TensorImageUtils.java @@ -106,7 +106,7 @@ public static Tensor bitmapToFloat32Tensor( final FloatBuffer floatBuffer = Tensor.allocateFloatBuffer(3 * width * height); bitmapToFloatBuffer(bitmap, x, y, width, height, normMeanRGB, normStdRGB, floatBuffer, 0); - return Tensor.newFloat32Tensor(new long[]{1, 3, height, width}, floatBuffer); + return Tensor.fromBlob(floatBuffer, new long[]{1, 3, height, width}); } /** @@ -146,7 +146,7 @@ public static Tensor imageYUV420CenterCropToFloat32Tensor( tensorWidth, tensorHeight, normMeanRGB, normStdRGB, floatBuffer, 0); - return Tensor.newFloat32Tensor(new long[]{1, 3, tensorHeight, tensorWidth}, floatBuffer); + return Tensor.fromBlob(floatBuffer, new long[]{1, 3, tensorHeight, tensorWidth}); } /** From 4a28ab95d0c89c059b553c82ac09f7d3d1c60d1d Mon Sep 17 00:00:00 2001 From: David Reiss Date: Mon, 7 Oct 2019 16:58:27 -0700 Subject: [PATCH 100/113] Clean up JavaDoc comments in pytorch_android 
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27455 Test Plan: Imported from OSS Differential Revision: D17800658 Pulled By: dreiss fbshipit-source-id: dbd01d9fa5ac82c50daf54c2869dc18be233d8dd --- .../src/main/java/org/pytorch/IValue.java | 46 +++++++++----- .../src/main/java/org/pytorch/Module.java | 40 ++++++------ .../src/main/java/org/pytorch/Tensor.java | 61 +++++++++++++------ 3 files changed, 91 insertions(+), 56 deletions(-) diff --git a/android/pytorch_android/src/main/java/org/pytorch/IValue.java b/android/pytorch_android/src/main/java/org/pytorch/IValue.java index 817fa56f2b358..99007cad3469f 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/IValue.java +++ b/android/pytorch_android/src/main/java/org/pytorch/IValue.java @@ -4,10 +4,21 @@ import java.util.Map; /** - * Java representation of a torchscript variable, which is implemented as tagged union that can be - * one of the supported types: https://pytorch.org/docs/stable/jit.html#types. + * Java representation of a TorchScript value, which is implemented as tagged union that can be + * one of the supported types: https://pytorch.org/docs/stable/jit.html#types . *

- * Calling getters for inappropriate types will throw IllegalStateException. + * Calling {@code toX} methods for inappropriate types will throw {@link IllegalStateException}. + *

+ * {@code IValue} objects are constructed with {@code IValue.from(value)}, + * {@code IValue.tupleFrom(value1, value2, ...)}, {@code IValue.listFrom(value1, value2, ...)}, + * or one of the {@code dict} methods, depending on the key type. + *

+ * Data is retrieved from {@code IValue} objects with the {@code toX()} methods. Note that + * {@code str}-type IValues must be extracted with {@link #toStr()}, + * rather than {@link #toString()}. + *
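A brief sketch of the conventions this comment block documents, including the reference-retention caveat noted just below. Illustrative only; the class name is invented.

import org.pytorch.IValue;

public class IValueDocSketch {
  public static void main(String[] args) {
    IValue v = IValue.from("a TorchScript str");

    // str-typed IValues are read with toStr(); toString() is just
    // java.lang.Object's default representation.
    String s = v.toStr();

    // Calling a toX() accessor for the wrong type throws IllegalStateException.
    try {
      v.toLong();
    } catch (IllegalStateException e) {
      System.out.println("wrong accessor: " + e.getMessage());
    }

    // Arrays handed to listFrom may be retained by reference, so avoid
    // mutating them while the IValue is still in use.
    long[] data = {1, 2, 3};
    IValue list = IValue.listFrom(data);
    System.out.println(s + " / " + list.toLongList()[0]);
  }
}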

+ * {@code IValue} objects may retain references to objects passed into their constructors, + * and may return references to their internal state from {@code toX()}. */ public class IValue { private static final int TYPE_CODE_NULL = 1; @@ -91,12 +102,15 @@ public boolean isDictLongKey() { return TYPE_CODE_DICT_LONG_KEY == this.mTypeCode; } + /** + * Creates a new {@code IValue} of type {@code Optional} that contains no value. + */ public static IValue optionalNull() { return new IValue(TYPE_CODE_NULL); } /** - * Creates a new IValue instance of torchscript Tensor type. + * Creates a new {@code IValue} of type {@code Tensor}. */ public static IValue from(Tensor tensor) { final IValue iv = new IValue(TYPE_CODE_TENSOR); @@ -105,7 +119,7 @@ public static IValue from(Tensor tensor) { } /** - * Creates a new IValue instance of torchscript bool type. + * Creates a new {@code IValue} of type {@code bool}. */ public static IValue from(boolean value) { final IValue iv = new IValue(TYPE_CODE_BOOL); @@ -114,7 +128,7 @@ public static IValue from(boolean value) { } /** - * Creates a new IValue instance of torchscript int type. + * Creates a new {@code IValue} of type {@code int}. */ public static IValue from(long value) { final IValue iv = new IValue(TYPE_CODE_LONG); @@ -123,7 +137,7 @@ public static IValue from(long value) { } /** - * Creates a new IValue instance of torchscript float type. + * Creates a new {@code IValue} of type {@code float}. */ public static IValue from(double value) { final IValue iv = new IValue(TYPE_CODE_DOUBLE); @@ -132,7 +146,7 @@ public static IValue from(double value) { } /** - * Creates new IValue instance of torchscript str type. + * Creates a new {@code IValue} of type {@code str}. */ public static IValue from(String value) { final IValue iv = new IValue(TYPE_CODE_STRING); @@ -141,7 +155,7 @@ public static IValue from(String value) { } /** - * Creates a new IValue instance of torchscript List[bool] type. + * Creates a new {@code IValue} of type {@code List[bool]}. */ public static IValue listFrom(boolean... list) { final IValue iv = new IValue(TYPE_CODE_BOOL_LIST); @@ -150,7 +164,7 @@ public static IValue listFrom(boolean... list) { } /** - * Creates a new IValue instance of torchscript List[int] type. + * Creates a new {@code IValue} of type {@code List[int]}. */ public static IValue listFrom(long... list) { final IValue iv = new IValue(TYPE_CODE_LONG_LIST); @@ -159,7 +173,7 @@ public static IValue listFrom(long... list) { } /** - * Creates a new IValue instance of torchscript List[float] type. + * Creates a new {@code IValue} of type {@code List[float]}. */ public static IValue listFrom(double... list) { final IValue iv = new IValue(TYPE_CODE_DOUBLE_LIST); @@ -168,7 +182,7 @@ public static IValue listFrom(double... list) { } /** - * Creates a new IValue instance of torchscript List[Tensor] type. + * Creates a new {@code IValue} of type {@code List[Tensor]}. */ public static IValue listFrom(Tensor... list) { final IValue iv = new IValue(TYPE_CODE_TENSOR_LIST); @@ -177,7 +191,7 @@ public static IValue listFrom(Tensor... list) { } /** - * Creates a new IValue instance of torchscript List[T] type. All elements must have the same type. + * Creates a new {@code IValue} of type {@code List[T]}. All elements must have the same type. */ public static IValue listFrom(IValue... array) { final int size = array.length; @@ -196,7 +210,7 @@ public static IValue listFrom(IValue... array) { } /** - * Creates a new IValue instance of torchscript Tuple[T0, T1, ...] type. 
+ * Creates a new {@code IValue} of type {@code Tuple[T0, T1, ...]}. */ public static IValue tupleFrom(IValue... array) { final IValue iv = new IValue(TYPE_CODE_TUPLE); @@ -205,7 +219,7 @@ public static IValue tupleFrom(IValue... array) { } /** - * Creates a new IValue instance oftorchscript Dict[Str, V] type. + * Creates a new {@code IValue} of type {@code Dict[str, V]}. */ public static IValue dictStringKeyFrom(Map map) { final IValue iv = new IValue(TYPE_CODE_DICT_STRING_KEY); @@ -214,7 +228,7 @@ public static IValue dictStringKeyFrom(Map map) { } /** - * Creates a new IValue instance of torchscript Dict[int, V] type. + * Creates a new {@code IValue} of type {@code Dict[int, V]}. */ public static IValue dictLongKeyFrom(Map map) { final IValue iv = new IValue(TYPE_CODE_DICT_LONG_KEY); diff --git a/android/pytorch_android/src/main/java/org/pytorch/Module.java b/android/pytorch_android/src/main/java/org/pytorch/Module.java index 04147d3c26960..4ca47a4491af7 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Module.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Module.java @@ -5,21 +5,20 @@ import com.facebook.jni.HybridData; /** - * Java holder for torch::jit::script::Module which owns it on jni side. + * Java wrapper for torch::jit::script::Module. */ public class Module { private NativePeer mNativePeer; /** - * Loads serialized torchscript module from the specified absolute path on the disk. + * Loads a serialized TorchScript module from the specified path on the disk. * - * @param modelAbsolutePath absolute path to file that contains the serialized torchscript module. - * @return new {@link org.pytorch.Module} object which owns torch::jit::script::Module on jni - * side. + * @param modelPath path to file that contains the serialized TorchScript module. + * @return new {@link org.pytorch.Module} object which owns torch::jit::script::Module. */ - public static Module load(final String modelAbsolutePath) { - return new Module(modelAbsolutePath); + public static Module load(final String modelPath) { + return new Module(modelPath); } private Module(final String moduleAbsolutePath) { @@ -27,35 +26,32 @@ private Module(final String moduleAbsolutePath) { } /** - * Runs 'forward' method of loaded torchscript module with specified arguments. + * Runs the 'forward' method of this module with the specified arguments. * - * @param inputs arguments for torchscript module 'forward' method. - * @return result of torchscript module 'forward' method evaluation + * @param inputs arguments for the TorchScript module's 'forward' method. + * @return return value from the 'forward' method. */ public IValue forward(IValue... inputs) { return mNativePeer.forward(inputs); } /** - * Runs specified method of loaded torchscript module with specified arguments. + * Runs the specified method of this module with the specified arguments. * - * @param methodName torchscript module method to run - * @param inputs arguments that will be specified to torchscript module method call - * @return result of torchscript module specified method evaluation + * @param methodName name of the TorchScript method to run. + * @param inputs arguments that will be passed to TorchScript method. + * @return return value from the method. */ public IValue runMethod(String methodName, IValue... inputs) { return mNativePeer.runMethod(methodName, inputs); } /** - * Explicitly destructs native part. Current instance can not be used after this call. This - * method may be called multiple times safely. 
As fbjni library destructs native part - * automatically when current instance will be - * collected by Java GC, the instance will not leak if this method is not called, - * but timing of deletion and the thread will be at the whim of the Java GC. - * If you want to control the thread and timing of the destructor, you should call this method - * explicitly. - * {@link com.facebook.jni.HybridData#resetNative} + * Explicitly destroys the native torch::jit::script::Module. + * Calling this method is not required, as the native object will be destroyed + * when this object is garbage-collected. However, the timing of garbage collection + * is not guaranteed, so proactively calling {@code destroy} can free memory more quickly. + * See {@link com.facebook.jni.HybridData#resetNative}. */ public void destroy() { mNativePeer.mHybridData.resetNative(); diff --git a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java index 13ffd4314502d..4fdcf448a6855 100644 --- a/android/pytorch_android/src/main/java/org/pytorch/Tensor.java +++ b/android/pytorch_android/src/main/java/org/pytorch/Tensor.java @@ -11,8 +11,22 @@ import java.util.Locale; /** - * Representation of Tensor. Tensor shape is stored in {@link Tensor#shape}, elements are stored as - * {@link java.nio.DirectByteBuffer} of one of the supported types. + * Representation of a Tensor. Behavior is similar to PyTorch's tensor objects. + *
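Before the Tensor documentation below, a short sketch of the Module lifecycle as rewritten above. The file path is hypothetical and the class name invented; a loadable TorchScript model and the pytorch_android native libraries are assumed.

import org.pytorch.IValue;
import org.pytorch.Module;
import org.pytorch.Tensor;

public class ModuleLifecycleSketch {
  public static void main(String[] args) {
    // Hypothetical path; Module.load expects a serialized TorchScript file on disk.
    Module module = Module.load("/data/local/tmp/model.pt");

    IValue input = IValue.from(
        Tensor.fromBlob(new float[] {1f, 2f, 3f, 4f}, new long[] {1, 4}));
    Tensor output = module.forward(input).toTensor();
    System.out.println("output numel: " + output.numel());

    // Optional: free the native torch::jit::script::Module immediately
    // instead of waiting for the garbage collector.
    module.destroy();
  }
}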

+ * Most tensors will be constructed as {@code Tensor.fromBlob(data, shape)}, + * where {@code data} can be an array or a direct {@link Buffer} (of the proper subclass). + * Helper methods are provided to allocate buffers properly. + *

+ * To access Tensor data, see {@link #dtype()}, {@link #shape()}, + * and various {@code getDataAs*} methods. + *
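To make the copy-versus-reference rules in this paragraph and the one that follows concrete, a small sketch; the class name is invented and a working pytorch_android runtime is assumed.

import java.nio.FloatBuffer;
import org.pytorch.Tensor;

public class BufferSemanticsSketch {
  public static void main(String[] args) {
    // A direct buffer is shared with the tensor, not copied, so it can be
    // refilled between Module calls to avoid per-inference reallocation.
    FloatBuffer buf = Tensor.allocateFloatBuffer(4);
    Tensor t = Tensor.fromBlob(buf, new long[] {4});
    buf.put(0, 42f);  // the write is visible through the tensor

    // getDataAs* results may be a copy or a reference; treat them as read-only.
    float[] read = t.getDataAsFloatArray();
    System.out.println(read[0]);  // 42.0

    // shape() always returns a fresh copy, safe to modify.
    long[] shape = t.shape();
    shape[0] = -1;                     // does not affect t
    System.out.println(t.shape()[0]); // still 4
  }
}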

+ * When constructing {@code Tensor} objects with {@code data} as an array, + * it is not specified whether this data is copied or retained as a reference + * so it is recommended not to modify it after constructing. {@code data} passed as a + * {@link Buffer} is not copied, so it can be modified between {@link Module} calls + * to avoid reallocation. Data retrieved from {@code Tensor} objects may be copied or + * may be a reference to the {@code Tensor}'s internal data buffer. + * {@code shape} is always copied. */ public abstract class Tensor { private static final String ERROR_MSG_DATA_BUFFER_NOT_NULL = "Data buffer must be not null"; @@ -43,6 +57,12 @@ public static ByteBuffer allocateByteBuffer(int numElements) { return ByteBuffer.allocateDirect(numElements).order(ByteOrder.nativeOrder()); } + /** + * Allocates a new direct {@link java.nio.IntBuffer} with native byte order with specified + * capacity that can be used in {@link Tensor#fromBlob(IntBuffer, long[])}. + * + * @param numElements capacity (number of elements) of result buffer. + */ public static IntBuffer allocateIntBuffer(int numElements) { return ByteBuffer.allocateDirect(numElements * INT_SIZE_BYTES) .order(ByteOrder.nativeOrder()) @@ -190,7 +210,7 @@ public static Tensor fromBlob(long[] shape, double[] data) { /** * Creates a new Tensor instance with dtype torch.uint8 with specified shape and data. * - * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} + * @param data Direct buffer with native byte order that contains {@code Tensor.numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. * @param shape Tensor shape @@ -210,7 +230,7 @@ public static Tensor fromBlobUnsigned(ByteBuffer data, long[] shape) { /** * Creates a new Tensor instance with dtype torch.int8 with specified shape and data. * - * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} + * @param data Direct buffer with native byte order that contains {@code Tensor.numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. * @param shape Tensor shape @@ -230,7 +250,7 @@ public static Tensor fromBlob(ByteBuffer data, long[] shape) { /** * Creates a new Tensor instance with dtype torch.int32 with specified shape and data. * - * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} + * @param data Direct buffer with native byte order that contains {@code Tensor.numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. * @param shape Tensor shape @@ -250,7 +270,7 @@ public static Tensor fromBlob(IntBuffer data, long[] shape) { /** * Creates a new Tensor instance with dtype torch.float32 with specified shape and data. * - * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} + * @param data Direct buffer with native byte order that contains {@code Tensor.numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. * @param shape Tensor shape @@ -270,7 +290,7 @@ public static Tensor fromBlob(FloatBuffer data, long[] shape) { /** * Creates a new Tensor instance with dtype torch.int64 with specified shape and data.
* - * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} + * @param data Direct buffer with native byte order that contains {@code Tensor.numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. * @param shape Tensor shape @@ -290,7 +310,7 @@ public static Tensor fromBlob(LongBuffer data, long[] shape) { /** * Creates a new Tensor instance with dtype torch.float64 with specified shape and data. * - * @param data Direct buffer with native byte order that contains {@code Tensor#numel(shape)} + * @param data Direct buffer with native byte order that contains {@code Tensor.numel(shape)} * elements. The buffer is used directly without copying, and changes to its content will * change the tensor. * @param shape Tensor shape @@ -312,12 +332,14 @@ private Tensor(long[] shape) { this.shape = Arrays.copyOf(shape, shape.length); } - /** Calculates number of elements in current tensor instance. */ + /** Returns the number of elements in this tensor. */ public long numel() { return numel(this.shape); } - /** Calculates number of elements in tensor with specified shape. */ + /** + * Calculates the number of elements in a tensor with the specified shape. + */ public static long numel(long[] shape) { checkShape(shape); int result = 1; @@ -327,22 +349,25 @@ public static long numel(long[] shape) { return result; } - /** Shape of current tensor. */ + /** + * Returns the shape of this tensor. (The array is a fresh copy.) + */ public long[] shape() { return Arrays.copyOf(shape, shape.length); } /** - * Returns dtype of current tensor. + * @return data type of this tensor. */ public abstract DType dtype(); + // Called from native int dtypeJniCode() { return dtype().jniCode; } /** - * Returns newly allocated java byte array that contains a copy of tensor data. + * @return a Java byte array that contains the tensor data. This may be a copy or reference. * * @throws IllegalStateException if it is called for a non-int8 tensor. */ @@ -352,7 +377,7 @@ public byte[] getDataAsByteArray() { } /** - * Returns newly allocated java byte array that contains a copy of tensor data. + * @return a Java byte array that contains the tensor data. This may be a copy or reference. * * @throws IllegalStateException if it is called for a non-uint8 tensor. */ @@ -362,7 +387,7 @@ public byte[] getDataAsUnsignedByteArray() { } /** - * Returns newly allocated java byte array that contains a copy of tensor data. + * @return a Java int array that contains the tensor data. This may be a copy or reference. * * @throws IllegalStateException if it is called for a non-int32 tensor. */ @@ -372,7 +397,7 @@ public int[] getDataAsIntArray() { } /** - * Returns newly allocated java byte array that contains a copy of tensor data. + * @return a Java float array that contains the tensor data. This may be a copy or reference. * * @throws IllegalStateException if it is called for a non-float32 tensor. */ @@ -382,7 +407,7 @@ public float[] getDataAsFloatArray() { } /** - * Returns newly allocated java byte array that contains a copy of tensor data. + * @return a Java long array that contains the tensor data. This may be a copy or reference. * * @throws IllegalStateException if it is called for a non-int64 tensor. */ @@ -392,7 +417,7 @@ public long[] getDataAsLongArray() { } /** - * Returns newly allocated java byte array that contains a copy of tensor data. + * @return a Java double array that contains the tensor data. 
This may be a copy or reference. * * @throws IllegalStateException if it is called for a non-float64 tensor. */ From ce16d689b362a1d675942d1ab016e5112db913bf Mon Sep 17 00:00:00 2001 From: David Date: Mon, 7 Oct 2019 17:11:49 -0700 Subject: [PATCH 101/113] FunctionEventAvg implements __iadd__ interface (#27498) Summary: Resolving issue https://github.com/pytorch/pytorch/issues/26433 by making FunctionEventAvg implement the `__iadd__` interface again, like it used to. Pull Request resolved: https://github.com/pytorch/pytorch/pull/27498 Differential Revision: D17801918 Pulled By: ezyang fbshipit-source-id: 0597059c903ac168ed64a05ac1decff3ffd14f06 --- torch/autograd/profiler.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index 0f4ae714380ec..ea1a797c12c8b 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -613,6 +613,9 @@ def add(self, other, group_by_input_shapes=False): self.count += 1 return self + def __iadd__(self, other): + return self.add(other) + def __repr__(self): return ( ' Date: Mon, 7 Oct 2019 17:22:33 -0700 Subject: [PATCH 102/113] Move hipify to torch/utils to bundle them into torch package (#27425) Summary: Similar to https://github.com/pytorch/pytorch/pull/27418 but try to put it under "torch" namespace Pull Request resolved: https://github.com/pytorch/pytorch/pull/27425 Differential Revision: D17779490 Pulled By: bddppq fbshipit-source-id: 688338d143509b37dfc110df17af3331db48a42b --- .flake8 | 2 +- .../hip/impl/HIPGuardImplMasqueradingAsCUDA.h | 2 +- c10/cuda/CMakeLists.txt | 2 +- c10/cuda/README.md | 2 +- test/test_utils.py | 5 + tools/amd_build/build_amd.py | 12 +- .../pyHIPIFY/cuda_to_hip_mappings.py | 2414 ----- .../utils/hipify}/__init__.py | 0 .../utils/hipify}/constants.py | 0 torch/utils/hipify/cuda_to_hip_mappings.py | 8083 +++++++++++++++++ .../utils/hipify}/hipify_python.py | 16 +- 11 files changed, 8112 insertions(+), 2426 deletions(-) delete mode 100644 tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py rename {tools/amd_build/pyHIPIFY => torch/utils/hipify}/__init__.py (100%) rename {tools/amd_build/pyHIPIFY => torch/utils/hipify}/constants.py (100%) create mode 100644 torch/utils/hipify/cuda_to_hip_mappings.py rename {tools/amd_build/pyHIPIFY => torch/utils/hipify}/hipify_python.py (98%) diff --git a/.flake8 b/.flake8 index bb1ecb6fb4bd9..d5d0a4544676f 100644 --- a/.flake8 +++ b/.flake8 @@ -11,4 +11,4 @@ ignore = B007,B008, # these ignores are from flake8-comprehensions; please fix! C400,C401,C402,C403,C404,C405,C407,C411, -exclude = docs/src,venv,third_party,caffe2,scripts,docs/caffe2,tools/amd_build/pyHIPIFY,torch/lib/include,torch/lib/tmp_install,build,torch/include,*.pyi +exclude = docs/src,venv,third_party,caffe2,scripts,docs/caffe2,torch/lib/include,torch/lib/tmp_install,build,torch/include,*.pyi diff --git a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h index 7fa2bcbca01e5..8ad73a9be7044 100644 --- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -44,7 +44,7 @@ namespace c10 { namespace hip { // we switch PyTorch to calling a HIP a HIP. 
// // When you add a new MasqueradingAsCUDA class/function, you need to -// also update the rewrite rules in tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +// also update the rewrite rules in torch/utils/hipify/cuda_to_hip_mappings.py // // // diff --git a/c10/cuda/CMakeLists.txt b/c10/cuda/CMakeLists.txt index 3514fac0eeae1..e992a4e1bfdae 100644 --- a/c10/cuda/CMakeLists.txt +++ b/c10/cuda/CMakeLists.txt @@ -17,7 +17,7 @@ configure_file( # transitively passed on to all libraries dependent on PyTorch. # Note: if you add a new source file/header, you will need to update -# tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py for new files +# torch/utils/hipify/cuda_to_hip_mappings.py for new files # and headers you add set(C10_CUDA_SRCS CUDAStream.cpp diff --git a/c10/cuda/README.md b/c10/cuda/README.md index 1cafbc78fad8d..c65ba8e3b155e 100644 --- a/c10/cuda/README.md +++ b/c10/cuda/README.md @@ -30,7 +30,7 @@ void my_func(); ``` Thus, if you add new functionality to c10, you must also update `C10_MAPPINGS` -`tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py` to transpile +`torch/utils/hipify/cuda_to_hip_mappings.py` to transpile occurrences of `cuda::my_func` to `hip::my_func`. (At the moment, we do NOT have a catch all `cuda::` to `hip::` namespace conversion, as not all `cuda` namespaces are converted to `hip::`, even though diff --git a/test/test_utils.py b/test/test_utils.py index 619cb626214f9..df032d2f7a5ac 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -570,5 +570,10 @@ def test_load_zip_checkpoint(self): SUM_OF_HUB_EXAMPLE) +class TestHipify(TestCase): + def test_import_hipify(self): + from torch.utils.hipify import hipify_python # noqa + + if __name__ == '__main__': run_tests() diff --git a/tools/amd_build/build_amd.py b/tools/amd_build/build_amd.py index 48e8ecceabdea..60f85268bad8f 100644 --- a/tools/amd_build/build_amd.py +++ b/tools/amd_build/build_amd.py @@ -4,8 +4,16 @@ import os import subprocess import argparse - -from pyHIPIFY import hipify_python +import sys +sys.path.append(os.path.realpath(os.path.join( + __file__, + os.path.pardir, + os.path.pardir, + os.path.pardir, + 'torch', + 'utils'))) + +from hipify import hipify_python parser = argparse.ArgumentParser(description='Top-level script for HIPifying, filling in most common parameters') parser.add_argument( diff --git a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py b/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py deleted file mode 100644 index 1e04f687e457e..0000000000000 --- a/tools/amd_build/pyHIPIFY/cuda_to_hip_mappings.py +++ /dev/null @@ -1,2414 +0,0 @@ -#!/usr/bin/env python3 -import collections - -from pyHIPIFY.constants import * - -""" Mapping of CUDA functions, include files, constants, and types to ROCm/HIP equivalents -This closely follows the implementation in hipify-clang -https://github.com/ROCm-Developer-Tools/HIP/blob/master/hipify-clang/src/CUDA2HipMap.cpp -and its structure. -There are different maps for fundamental names, include files, identifies, sparse, and -PyTorch specific translations. -Each of the entries in these maps translates a CUDA string to a tuple containing the -ROCm/HIP string, a type and API annotation and - optionally - an annotation if it is not -supported in ROCm/HIP yet. -""" - -# List of math functions that should be replaced inside device code only. 
-MATH_TRANSPILATIONS = collections.OrderedDict([ - ("std::max", ("::max")), - ("std::min", ("::min")), - ("std::ceil", ("::ceil")), - ("std::floor", ("::floor")), - ("std::exp", ("::exp")), - ("std::log", ("::log")), - ("std::pow", ("::pow")), - ("std::fabs", ("::fabs")), - ("std::fmod", ("::fmod")), - ("std::remainder", ("::remainder")), -]) - -CUDA_TYPE_NAME_MAP = collections.OrderedDict([ - ("CUresult", ("hipError_t", CONV_TYPE, API_DRIVER)), - ("cudaError_t", ("hipError_t", CONV_TYPE, API_RUNTIME)), - ("CUDA_ARRAY3D_DESCRIPTOR", ("HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY_DESCRIPTOR", ("HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER)), - ("CUDA_MEMCPY2D", ("hip_Memcpy2D", CONV_TYPE, API_DRIVER)), - ("CUDA_MEMCPY3D", ("HIP_MEMCPY3D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_MEMCPY3D_PEER", ("HIP_MEMCPY3D_PEER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_POINTER_ATTRIBUTE_P2P_TOKENS", ("HIP_POINTER_ATTRIBUTE_P2P_TOKENS", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_RESOURCE_DESC", ("HIP_RESOURCE_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_RESOURCE_VIEW_DESC", ("HIP_RESOURCE_VIEW_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUipcEventHandle", ("hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUipcMemHandle", ("hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUaddress_mode", ("hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUarray_cubemap_face", ("hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUarray_format", ("hipArray_format", CONV_TYPE, API_DRIVER)), - ("CUcomputemode", ("hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUmem_advise", ("hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUmem_range_attribute", ("hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUctx_flags", ("hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUdevice", ("hipDevice_t", CONV_TYPE, API_DRIVER)), - ("CUdevice_attribute_enum", ("hipDeviceAttribute_t", CONV_TYPE, API_DRIVER)), - ("CUdevice_attribute", ("hipDeviceAttribute_t", CONV_TYPE, API_DRIVER)), - ("CUdeviceptr", ("hipDeviceptr_t", CONV_TYPE, API_DRIVER)), - ("CUarray_st", ("hipArray", CONV_TYPE, API_DRIVER)), - ("CUarray", ("hipArray *", CONV_TYPE, API_DRIVER)), - ("CUdevprop_st", ("hipDeviceProp_t", CONV_TYPE, API_DRIVER)), - ("CUdevprop", ("hipDeviceProp_t", CONV_TYPE, API_DRIVER)), - ("CUfunction", ("hipFunction_t", CONV_TYPE, API_DRIVER)), - ("CUgraphicsResource", ("hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUmipmappedArray", ("hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUfunction_attribute", ("hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUfunction_attribute_enum", ("hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUgraphicsMapResourceFlags", ("hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUgraphicsMapResourceFlags_enum", ("hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUgraphicsRegisterFlags", ("hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUgraphicsRegisterFlags_enum", ("hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUoccupancy_flags", ("hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUoccupancy_flags_enum", ("hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUfunc_cache_enum", ("hipFuncCache", CONV_TYPE, 
API_DRIVER)), - ("CUfunc_cache", ("hipFuncCache", CONV_TYPE, API_DRIVER)), - ("CUipcMem_flags", ("hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUipcMem_flags_enum", ("hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjit_cacheMode", ("hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjit_cacheMode_enum", ("hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjit_fallback", ("hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjit_fallback_enum", ("hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjit_option", ("hipJitOption", CONV_JIT, API_DRIVER)), - ("CUjit_option_enum", ("hipJitOption", CONV_JIT, API_DRIVER)), - ("CUjit_target", ("hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjit_target_enum", ("hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjitInputType", ("hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUjitInputType_enum", ("hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUlimit", ("hipLimit_t", CONV_TYPE, API_DRIVER)), - ("CUlimit_enum", ("hipLimit_t", CONV_TYPE, API_DRIVER)), - ("CUmemAttach_flags", ("hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUmemAttach_flags_enum", ("hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUmemorytype", ("hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUmemorytype_enum", ("hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUresourcetype", ("hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("CUresourcetype_enum", ("hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("CUresourceViewFormat", ("hipResourceViewFormat", CONV_TEX, API_DRIVER)), - ("CUresourceViewFormat_enum", ("hipResourceViewFormat", CONV_TEX, API_DRIVER)), - ("CUsharedconfig", ("hipSharedMemConfig", CONV_TYPE, API_DRIVER)), - ("CUsharedconfig_enum", ("hipSharedMemConfig", CONV_TYPE, API_DRIVER)), - ("CUcontext", ("hipCtx_t", CONV_TYPE, API_DRIVER)), - ("CUmodule", ("hipModule_t", CONV_TYPE, API_DRIVER)), - ("CUstream", ("hipStream_t", CONV_TYPE, API_DRIVER)), - ("CUstream_st", ("ihipStream_t", CONV_TYPE, API_DRIVER)), - ("CUstreamCallback", ("hipStreamCallback_t", CONV_TYPE, API_DRIVER)), - ("CUsurfObject", ("hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUsurfref", ("hipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUtexObject", ("hipTextureObject_t", CONV_TYPE, API_DRIVER)), - ("CUtexref", ("textureReference", CONV_TYPE, API_DRIVER)), - ("CUstream_flags", ("hipStreamFlags", CONV_TYPE, API_DRIVER)), - ("CUstreamWaitValue_flags", ("hipStreamWaitValueFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUstreamWriteValue_flags", ("hipStreamWriteValueFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUstreamBatchMemOpType", ("hipStreamBatchMemOpType", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUdevice_P2PAttribute", ("hipDeviceP2PAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUevent", ("hipEvent_t", CONV_TYPE, API_DRIVER)), - ("CUevent_st", ("ihipEvent_t", CONV_TYPE, API_DRIVER)), - ("CUevent_flags", ("hipEventFlags", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED)), - ("CUfilter_mode", ("hipTextureFilterMode", CONV_TEX, API_DRIVER)), - ("CUGLDeviceList", ("hipGLDeviceList", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CUGLmap_flags", ("hipGLMapFlags", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d9DeviceList", ("hipD3D9DeviceList", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d9map_flags", 
("hipD3D9MapFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d9register_flags", ("hipD3D9RegisterFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d10DeviceList", ("hipd3d10DeviceList", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d10map_flags", ("hipD3D10MapFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d10register_flags", ("hipD3D10RegisterFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CUd3d11DeviceList", ("hipd3d11DeviceList", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("CUeglStreamConnection_st", ("hipEglStreamConnection", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("CUeglStreamConnection", ("hipEglStreamConnection", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("libraryPropertyType_t", ("hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("libraryPropertyType", ("hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaStreamCallback_t", ("hipStreamCallback_t", CONV_TYPE, API_RUNTIME)), - ("cudaArray", ("hipArray", CONV_MEM, API_RUNTIME)), - ("cudaArray_t", ("hipArray_t", CONV_MEM, API_RUNTIME)), - ("cudaArray_const_t", ("hipArray_const_t", CONV_MEM, API_RUNTIME)), - ("cudaMipmappedArray_t", ("hipMipmappedArray_t", CONV_MEM, API_RUNTIME)), - ("cudaMipmappedArray_const_t", ("hipMipmappedArray_const_t", CONV_MEM, API_RUNTIME)), - ("cudaArrayDefault", ("hipArrayDefault", CONV_MEM, API_RUNTIME)), - ("cudaArrayLayered", ("hipArrayLayered", CONV_MEM, API_RUNTIME)), - ("cudaArraySurfaceLoadStore", ("hipArraySurfaceLoadStore", CONV_MEM, API_RUNTIME)), - ("cudaArrayCubemap", ("hipArrayCubemap", CONV_MEM, API_RUNTIME)), - ("cudaArrayTextureGather", ("hipArrayTextureGather", CONV_MEM, API_RUNTIME)), - ("cudaMemoryAdvise", ("hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeAttribute", ("hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpyKind", ("hipMemcpyKind", CONV_MEM, API_RUNTIME)), - ("cudaMemoryType", ("hipMemoryType", CONV_MEM, API_RUNTIME)), - ("cudaExtent", ("hipExtent", CONV_MEM, API_RUNTIME)), - ("cudaPitchedPtr", ("hipPitchedPtr", CONV_MEM, API_RUNTIME)), - ("cudaPos", ("hipPos", CONV_MEM, API_RUNTIME)), - ("cudaEvent_t", ("hipEvent_t", CONV_TYPE, API_RUNTIME)), - ("cudaStream_t", ("hipStream_t", CONV_TYPE, API_RUNTIME)), - ("cudaPointerAttributes", ("hipPointerAttribute_t", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceAttr", ("hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceProp", ("hipDeviceProp_t", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceP2PAttr", ("hipDeviceP2PAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaComputeMode", ("hipComputeMode", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaFuncCache", ("hipFuncCache_t", CONV_CACHE, API_RUNTIME)), - ("cudaFuncAttributes", ("hipFuncAttributes", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSharedMemConfig", ("hipSharedMemConfig", CONV_TYPE, API_RUNTIME)), - ("cudaLimit", ("hipLimit_t", CONV_TYPE, API_RUNTIME)), - ("cudaOutputMode", ("hipOutputMode", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaTextureReadMode", ("hipTextureReadMode", CONV_TEX, API_RUNTIME)), - ("cudaTextureFilterMode", ("hipTextureFilterMode", CONV_TEX, API_RUNTIME)), - ("cudaChannelFormatKind", ("hipChannelFormatKind", CONV_TEX, API_RUNTIME)), - ("cudaChannelFormatDesc", ("hipChannelFormatDesc", CONV_TEX, API_RUNTIME)), - ("cudaResourceDesc", ("hipResourceDesc", CONV_TEX, API_RUNTIME)), - ("cudaResourceViewDesc", ("hipResourceViewDesc", CONV_TEX, API_RUNTIME)), - ("cudaTextureDesc", 
("hipTextureDesc", CONV_TEX, API_RUNTIME)), - ("surfaceReference", ("hipSurfaceReference", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaTextureObject_t", ("hipTextureObject_t", CONV_TEX, API_RUNTIME)), - ("cudaResourceType", ("hipResourceType", CONV_TEX, API_RUNTIME)), - ("cudaResourceViewFormat", ("hipResourceViewFormat", CONV_TEX, API_RUNTIME)), - ("cudaTextureAddressMode", ("hipTextureAddressMode", CONV_TEX, API_RUNTIME)), - ("cudaSurfaceBoundaryMode", ("hipSurfaceBoundaryMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSurfaceFormatMode", ("hipSurfaceFormatMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaTextureType1D", ("hipTextureType1D", CONV_TEX, API_RUNTIME)), - ("cudaTextureType2D", ("hipTextureType2D", CONV_TEX, API_RUNTIME)), - ("cudaTextureType3D", ("hipTextureType3D", CONV_TEX, API_RUNTIME)), - ("cudaTextureTypeCubemap", ("hipTextureTypeCubemap", CONV_TEX, API_RUNTIME)), - ("cudaTextureType1DLayered", ("hipTextureType1DLayered", CONV_TEX, API_RUNTIME)), - ("cudaTextureType2DLayered", ("hipTextureType2DLayered", CONV_TEX, API_RUNTIME)), - ("cudaTextureTypeCubemapLayered", ("hipTextureTypeCubemapLayered", CONV_TEX, API_RUNTIME)), - ("cudaIpcEventHandle_t", ("hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME)), - ("cudaIpcEventHandle_st", ("hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME)), - ("cudaIpcMemHandle_t", ("hipIpcMemHandle_t", CONV_TYPE, API_RUNTIME)), - ("cudaIpcMemHandle_st", ("hipIpcMemHandle_t", CONV_TYPE, API_RUNTIME)), - ("cudaGraphicsCubeFace", ("hipGraphicsCubeFace", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsMapFlags", ("hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsRegisterFlags", ("hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLDeviceList", ("hipGLDeviceList", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLMapFlags", ("hipGLMapFlags", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9DeviceList", ("hipD3D9DeviceList", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9MapFlags", ("hipD3D9MapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9RegisterFlags", ("hipD3D9RegisterFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10DeviceList", ("hipd3d10DeviceList", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10MapFlags", ("hipD3D10MapFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10RegisterFlags", ("hipD3D10RegisterFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11DeviceList", ("hipd3d11DeviceList", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEglStreamConnection", ("hipEglStreamConnection", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cublasHandle_t", ("rocblas_handle", CONV_TYPE, API_BLAS)), - ("cublasOperation_t", ("rocblas_operation", CONV_TYPE, API_BLAS)), - ("cublasStatus_t", ("rocblas_status", CONV_TYPE, API_BLAS)), - ("cublasFillMode_t", ("rocblas_fill", CONV_TYPE, API_BLAS)), - ("cublasDiagType_t", ("rocblas_diagonal", CONV_TYPE, API_BLAS)), - ("cublasSideMode_t", ("rocblas_side", CONV_TYPE, API_BLAS)), - ("cublasPointerMode_t", ("rocblas_pointer_mode", CONV_TYPE, API_BLAS)), - ("cublasAtomicsMode_t", ("rocblas_atomics_mode", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDataType_t", ("rocblas_data_type", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED)), - ("curandStatus", ("hiprandStatus_t", CONV_TYPE, API_RAND)), - ("curandStatus_t", ("hiprandStatus_t", CONV_TYPE, API_RAND)), - ("curandRngType", ("hiprandRngType_t", CONV_TYPE, API_RAND)), - 
("curandRngType_t", ("hiprandRngType_t", CONV_TYPE, API_RAND)), - ("curandGenerator_st", ("hiprandGenerator_st", CONV_TYPE, API_RAND)), - ("curandGenerator_t", ("hiprandGenerator_t", CONV_TYPE, API_RAND)), - ("curandDirectionVectorSet", ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDirectionVectorSet_t", ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandOrdering", ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandOrdering_t", ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDistribution_st", ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandHistogramM2V_st", ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDistribution_t", ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandHistogramM2V_t", ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDistributionShift_st", ("hiprandDistributionShift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDistributionShift_t", ("hiprandDistributionShift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDistributionM2Shift_st", ("hiprandDistributionM2Shift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDistributionM2Shift_t", ("hiprandDistributionM2Shift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandHistogramM2_st", ("hiprandHistogramM2_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandHistogramM2_t", ("hiprandHistogramM2_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandHistogramM2K_st", ("hiprandHistogramM2K_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandHistogramM2K_t", ("hiprandHistogramM2K_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDiscreteDistribution_st", ("hiprandDiscreteDistribution_st", CONV_TYPE, API_RAND)), - ("curandDiscreteDistribution_t", ("hiprandDiscreteDistribution_t", CONV_TYPE, API_RAND)), - ("curandMethod", ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandMethod_t", ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandDirectionVectors32_t", ("hiprandDirectionVectors32_t", CONV_TYPE, API_RAND)), - ("curandDirectionVectors64_t", ("hiprandDirectionVectors64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandStateMtgp32_t", ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND)), - ("curandStateMtgp32", ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND)), - ("curandStateScrambledSobol64_t", ("hiprandStateScrambledSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandStateSobol64_t", ("hiprandStateSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandStateScrambledSobol32_t", ("hiprandStateScrambledSobol32_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), - ("curandStateSobol32_t", ("hiprandStateSobol32_t", CONV_TYPE, API_RAND)), - ("curandStateMRG32k3a_t", ("hiprandStateMRG32k3a_t", CONV_TYPE, API_RAND)), - ("curandStatePhilox4_32_10_t", ("hiprandStatePhilox4_32_10_t", CONV_TYPE, API_RAND)), - ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)), - ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)), - ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)), -]) - -CUDA_INCLUDE_MAP = collections.OrderedDict([ - # since pytorch uses "\b{pattern}\b" as the actual re pattern, - # patterns listed here have to begin and end with alnum chars - ("include ", ("", CONV_INCLUDE, API_RUNTIME)), #PyTorch also has a source file named "nccl.h", so we need to "<"">" to differentiate - ("nvrtc.h", ("hip/hiprtc.h", CONV_INCLUDE, 
API_RTC)), - ("thrust/system/cuda", ("thrust/system/hip", CONV_INCLUDE, API_BLAS)), - ("cub/util_allocator.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("cub/block/block_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("cub/cub.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("cub/block/block_load.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)), - ("nvToolsExt.h", ("roctx.h", CONV_INCLUDE, API_ROCTX)), -]) - -CUDA_IDENTIFIER_MAP = collections.OrderedDict([ - ("__CUDACC__", ("__HIPCC__", CONV_DEF, API_RUNTIME)), - ("CUDA_ERROR_INVALID_CONTEXT", ("hipErrorInvalidContext", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_CONTEXT_ALREADY_CURRENT", ("hipErrorContextAlreadyCurrent", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_ARRAY_IS_MAPPED", ("hipErrorArrayIsMapped", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_ALREADY_MAPPED", ("hipErrorAlreadyMapped", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_ALREADY_ACQUIRED", ("hipErrorAlreadyAcquired", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_NOT_MAPPED", ("hipErrorNotMapped", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_NOT_MAPPED_AS_ARRAY", ("hipErrorNotMappedAsArray", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_NOT_MAPPED_AS_POINTER", ("hipErrorNotMappedAsPointer", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_CONTEXT_ALREADY_IN_USE", ("hipErrorContextAlreadyInUse", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_INVALID_SOURCE", ("hipErrorInvalidSource", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_FILE_NOT_FOUND", ("hipErrorFileNotFound", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_NOT_FOUND", ("hipErrorNotFound", CONV_TYPE, API_DRIVER)), - ("CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", ("hipErrorLaunchIncompatibleTexturing", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", ("hipErrorPrimaryContextActive", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ERROR_CONTEXT_IS_DESTROYED", ("hipErrorContextIsDestroyed", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ERROR_NOT_PERMITTED", ("hipErrorNotPermitted", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ERROR_NOT_SUPPORTED", ("hipErrorNotSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorMissingConfiguration", ("hipErrorMissingConfiguration", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorPriorLaunchFailure", ("hipErrorPriorLaunchFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidDeviceFunction", ("hipErrorInvalidDeviceFunction", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidConfiguration", ("hipErrorInvalidConfiguration", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidPitchValue", ("hipErrorInvalidPitchValue", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidSymbol", ("hipErrorInvalidSymbol", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidHostPointer", ("hipErrorInvalidHostPointer", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidDevicePointer", ("hipErrorInvalidDevicePointer", CONV_TYPE, API_RUNTIME)), - ("cudaErrorInvalidTexture", ("hipErrorInvalidTexture", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidTextureBinding", ("hipErrorInvalidTextureBinding", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidChannelDescriptor", ("hipErrorInvalidChannelDescriptor", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidMemcpyDirection", 
("hipErrorInvalidMemcpyDirection", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorAddressOfConstant", ("hipErrorAddressOfConstant", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorTextureFetchFailed", ("hipErrorTextureFetchFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorTextureNotBound", ("hipErrorTextureNotBound", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorSynchronizationError", ("hipErrorSynchronizationError", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidFilterSetting", ("hipErrorInvalidFilterSetting", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidNormSetting", ("hipErrorInvalidNormSetting", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorMixedDeviceExecution", ("hipErrorMixedDeviceExecution", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorNotYetImplemented", ("hipErrorNotYetImplemented", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorMemoryValueTooLarge", ("hipErrorMemoryValueTooLarge", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInsufficientDriver", ("hipErrorInsufficientDriver", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorSetOnActiveProcess", ("hipErrorSetOnActiveProcess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorInvalidSurface", ("hipErrorInvalidSurface", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorDuplicateVariableName", ("hipErrorDuplicateVariableName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorDuplicateTextureName", ("hipErrorDuplicateTextureName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorDuplicateSurfaceName", ("hipErrorDuplicateSurfaceName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorDevicesUnavailable", ("hipErrorDevicesUnavailable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorIncompatibleDriverContext", ("hipErrorIncompatibleDriverContext", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorDeviceAlreadyInUse", ("hipErrorDeviceAlreadyInUse", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorLaunchMaxDepthExceeded", ("hipErrorLaunchMaxDepthExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorLaunchFileScopedTex", ("hipErrorLaunchFileScopedTex", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorLaunchFileScopedSurf", ("hipErrorLaunchFileScopedSurf", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorSyncDepthExceeded", ("hipErrorSyncDepthExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorLaunchPendingCountExceeded", ("hipErrorLaunchPendingCountExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorNotPermitted", ("hipErrorNotPermitted", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorNotSupported", ("hipErrorNotSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorStartupFailure", ("hipErrorStartupFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaErrorApiFailureBase", ("hipErrorApiFailureBase", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_SUCCESS", ("hipSuccess", CONV_TYPE, API_DRIVER)), - ("cudaSuccess", ("hipSuccess", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_INVALID_VALUE", ("hipErrorInvalidValue", CONV_TYPE, API_DRIVER)), - ("cudaErrorInvalidValue", ("hipErrorInvalidValue", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_OUT_OF_MEMORY", ("hipErrorMemoryAllocation", CONV_TYPE, API_DRIVER)), - ("cudaErrorMemoryAllocation", ("hipErrorMemoryAllocation", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_NOT_INITIALIZED", ("hipErrorNotInitialized", CONV_TYPE, API_DRIVER)), - 
("cudaErrorInitializationError", ("hipErrorInitializationError", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_DEINITIALIZED", ("hipErrorDeinitialized", CONV_TYPE, API_DRIVER)), - ("cudaErrorCudartUnloading", ("hipErrorDeinitialized", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PROFILER_DISABLED", ("hipErrorProfilerDisabled", CONV_TYPE, API_DRIVER)), - ("cudaErrorProfilerDisabled", ("hipErrorProfilerDisabled", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PROFILER_NOT_INITIALIZED", ("hipErrorProfilerNotInitialized", CONV_TYPE, API_DRIVER)), - ("cudaErrorProfilerNotInitialized", ("hipErrorProfilerNotInitialized", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PROFILER_ALREADY_STARTED", ("hipErrorProfilerAlreadyStarted", CONV_TYPE, API_DRIVER)), - ("cudaErrorProfilerAlreadyStarted", ("hipErrorProfilerAlreadyStarted", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PROFILER_ALREADY_STOPPED", ("hipErrorProfilerAlreadyStopped", CONV_TYPE, API_DRIVER)), - ("cudaErrorProfilerAlreadyStopped", ("hipErrorProfilerAlreadyStopped", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_NO_DEVICE", ("hipErrorNoDevice", CONV_TYPE, API_DRIVER)), - ("cudaErrorNoDevice", ("hipErrorNoDevice", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_INVALID_DEVICE", ("hipErrorInvalidDevice", CONV_TYPE, API_DRIVER)), - ("cudaErrorInvalidDevice", ("hipErrorInvalidDevice", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_INVALID_IMAGE", ("hipErrorInvalidImage", CONV_TYPE, API_DRIVER)), - ("cudaErrorInvalidKernelImage", ("hipErrorInvalidImage", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_MAP_FAILED", ("hipErrorMapFailed", CONV_TYPE, API_DRIVER)), - ("cudaErrorMapBufferObjectFailed", ("hipErrorMapFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_UNMAP_FAILED", ("hipErrorUnmapFailed", CONV_TYPE, API_DRIVER)), - ("cudaErrorUnmapBufferObjectFailed", ("hipErrorUnmapFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_NO_BINARY_FOR_GPU", ("hipErrorNoBinaryForGpu", CONV_TYPE, API_DRIVER)), - ("cudaErrorNoKernelImageForDevice", ("hipErrorNoBinaryForGpu", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_ECC_UNCORRECTABLE", ("hipErrorECCNotCorrectable", CONV_TYPE, API_DRIVER)), - ("cudaErrorECCUncorrectable", ("hipErrorECCNotCorrectable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_UNSUPPORTED_LIMIT", ("hipErrorUnsupportedLimit", CONV_TYPE, API_DRIVER)), - ("cudaErrorUnsupportedLimit", ("hipErrorUnsupportedLimit", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", ("hipErrorPeerAccessUnsupported", CONV_TYPE, API_DRIVER)), - ("cudaErrorPeerAccessUnsupported", ("hipErrorPeerAccessUnsupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_INVALID_PTX", ("hipErrorInvalidKernelFile", CONV_TYPE, API_DRIVER)), - ("cudaErrorInvalidPtx", ("hipErrorInvalidKernelFile", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_INVALID_GRAPHICS_CONTEXT", ("hipErrorInvalidGraphicsContext", CONV_TYPE, API_DRIVER)), - ("cudaErrorInvalidGraphicsContext", ("hipErrorInvalidGraphicsContext", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_NVLINK_UNCORRECTABLE", ("hipErrorNvlinkUncorrectable", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorNvlinkUncorrectable", ("hipErrorNvlinkUncorrectable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", ("hipErrorSharedObjectSymbolNotFound", CONV_TYPE, API_DRIVER)), - ("cudaErrorSharedObjectSymbolNotFound", 
("hipErrorSharedObjectSymbolNotFound", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", ("hipErrorSharedObjectInitFailed", CONV_TYPE, API_DRIVER)), - ("cudaErrorSharedObjectInitFailed", ("hipErrorSharedObjectInitFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_OPERATING_SYSTEM", ("hipErrorOperatingSystem", CONV_TYPE, API_DRIVER)), - ("cudaErrorOperatingSystem", ("hipErrorOperatingSystem", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_INVALID_HANDLE", ("hipErrorInvalidResourceHandle", CONV_TYPE, API_DRIVER)), - ("cudaErrorInvalidResourceHandle", ("hipErrorInvalidResourceHandle", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_NOT_READY", ("hipErrorNotReady", CONV_TYPE, API_DRIVER)), - ("cudaErrorNotReady", ("hipErrorNotReady", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_ILLEGAL_ADDRESS", ("hipErrorIllegalAddress", CONV_TYPE, API_DRIVER)), - ("cudaErrorIllegalAddress", ("hipErrorIllegalAddress", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", ("hipErrorLaunchOutOfResources", CONV_TYPE, API_DRIVER)), - ("cudaErrorLaunchOutOfResources", ("hipErrorLaunchOutOfResources", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_LAUNCH_TIMEOUT", ("hipErrorLaunchTimeOut", CONV_TYPE, API_DRIVER)), - ("cudaErrorLaunchTimeout", ("hipErrorLaunchTimeOut", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", ("hipErrorPeerAccessAlreadyEnabled", CONV_TYPE, API_DRIVER)), - ("cudaErrorPeerAccessAlreadyEnabled", ("hipErrorPeerAccessAlreadyEnabled", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", ("hipErrorPeerAccessNotEnabled", CONV_TYPE, API_DRIVER)), - ("cudaErrorPeerAccessNotEnabled", ("hipErrorPeerAccessNotEnabled", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_ASSERT", ("hipErrorAssert", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorAssert", ("hipErrorAssert", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_TOO_MANY_PEERS", ("hipErrorTooManyPeers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorTooManyPeers", ("hipErrorTooManyPeers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", ("hipErrorHostMemoryAlreadyRegistered", CONV_TYPE, API_DRIVER)), - ("cudaErrorHostMemoryAlreadyRegistered", ("hipErrorHostMemoryAlreadyRegistered", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", ("hipErrorHostMemoryNotRegistered", CONV_TYPE, API_DRIVER)), - ("cudaErrorHostMemoryNotRegistered", ("hipErrorHostMemoryNotRegistered", CONV_TYPE, API_RUNTIME)), - ("CUDA_ERROR_HARDWARE_STACK_ERROR", ("hipErrorHardwareStackError", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorHardwareStackError", ("hipErrorHardwareStackError", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_ILLEGAL_INSTRUCTION", ("hipErrorIllegalInstruction", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorIllegalInstruction", ("hipErrorIllegalInstruction", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_MISALIGNED_ADDRESS", ("hipErrorMisalignedAddress", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorMisalignedAddress", ("hipErrorMisalignedAddress", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_INVALID_ADDRESS_SPACE", ("hipErrorInvalidAddressSpace", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorInvalidAddressSpace", ("hipErrorInvalidAddressSpace", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_INVALID_PC", ("hipErrorInvalidPc", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - 
("cudaErrorInvalidPc", ("hipErrorInvalidPc", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_LAUNCH_FAILED", ("hipErrorLaunchFailure", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorLaunchFailure", ("hipErrorLaunchFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_ERROR_UNKNOWN", ("hipErrorUnknown", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaErrorUnknown", ("hipErrorUnknown", CONV_TYPE, API_RUNTIME)), - ("CU_TR_ADDRESS_MODE_WRAP", ("HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TR_ADDRESS_MODE_CLAMP", ("HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TR_ADDRESS_MODE_MIRROR", ("HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TR_ADDRESS_MODE_BORDER", ("HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CUBEMAP_FACE_POSITIVE_X", ("HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CUBEMAP_FACE_NEGATIVE_X", ("HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CUBEMAP_FACE_POSITIVE_Y", ("HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CUBEMAP_FACE_NEGATIVE_Y", ("HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CUBEMAP_FACE_POSITIVE_Z", ("HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CUBEMAP_FACE_NEGATIVE_Z", ("HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_AD_FORMAT_UNSIGNED_INT8", ("HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_UNSIGNED_INT16", ("HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_UNSIGNED_INT32", ("HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_SIGNED_INT8", ("HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_SIGNED_INT16", ("HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_SIGNED_INT32", ("HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_HALF", ("HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER)), - ("CU_AD_FORMAT_FLOAT", ("HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER)), - ("CU_COMPUTEMODE_DEFAULT", ("hipComputeModeDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_COMPUTEMODE_EXCLUSIVE", ("hipComputeModeExclusive", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_COMPUTEMODE_PROHIBITED", ("hipComputeModeProhibited", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_COMPUTEMODE_EXCLUSIVE_PROCESS", ("hipComputeModeExclusiveProcess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ADVISE_SET_READ_MOSTLY", ("hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ADVISE_UNSET_READ_MOSTLY", ("hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ADVISE_SET_PREFERRED_LOCATION", ("hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", ("hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ADVISE_SET_ACCESSED_BY", ("hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ADVISE_UNSET_ACCESSED_BY", ("hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY", ("hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION", ("hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - 
("CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY", ("hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION", ("hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_SCHED_AUTO", ("HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_SCHED_SPIN", ("HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_SCHED_YIELD", ("HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_SCHED_BLOCKING_SYNC", ("HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_BLOCKING_SYNC", ("HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_SCHED_MASK", ("HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_MAP_HOST", ("HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_LMEM_RESIZE_TO_MAX", ("HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_CTX_FLAGS_MASK", ("HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_LAUNCH_PARAM_BUFFER_POINTER", ("HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_TYPE, API_DRIVER)), - ("CU_LAUNCH_PARAM_BUFFER_SIZE", ("HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_TYPE, API_DRIVER)), - ("CU_LAUNCH_PARAM_END", ("HIP_LAUNCH_PARAM_END", CONV_TYPE, API_DRIVER)), - ("CU_IPC_HANDLE_SIZE", ("HIP_LAUNCH_PARAM_END", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMHOSTALLOC_DEVICEMAP", ("HIP_MEMHOSTALLOC_DEVICEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMHOSTALLOC_PORTABLE", ("HIP_MEMHOSTALLOC_PORTABLE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMHOSTALLOC_WRITECOMBINED", ("HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMHOSTREGISTER_DEVICEMAP", ("HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMHOSTREGISTER_IOMEMORY", ("HIP_MEMHOSTREGISTER_IOMEMORY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMHOSTREGISTER_PORTABLE", ("HIP_MEMHOSTREGISTER_PORTABLE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_PARAM_TR_DEFAULT", ("HIP_PARAM_TR_DEFAULT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_LEGACY", ("HIP_STREAM_LEGACY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_PER_THREAD", ("HIP_STREAM_PER_THREAD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TRSA_OVERRIDE_FORMAT", ("HIP_TRSA_OVERRIDE_FORMAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TRSF_NORMALIZED_COORDINATES", ("HIP_TRSF_NORMALIZED_COORDINATES", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TRSF_READ_AS_INTEGER", ("HIP_TRSF_READ_AS_INTEGER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TRSF_SRGB", ("HIP_TRSF_SRGB", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY3D_2DARRAY", ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY3D_CUBEMAP", ("HIP_ARRAY3D_CUBEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY3D_DEPTH_TEXTURE", ("HIP_ARRAY3D_DEPTH_TEXTURE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY3D_LAYERED", ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY3D_SURFACE_LDST", ("HIP_ARRAY3D_SURFACE_LDST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_ARRAY3D_TEXTURE_GATHER", ("HIP_ARRAY3D_TEXTURE_GATHER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK", ("hipDeviceAttributeMaxThreadsPerBlock", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X", 
("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y", ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z", ("hipDeviceAttributeMaxBlockDimZ", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X", ("hipDeviceAttributeMaxGridDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y", ("hipDeviceAttributeMaxGridDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z", ("hipDeviceAttributeMaxGridDimZ", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK", ("hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK", ("hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY", ("hipDeviceAttributeTotalConstantMemory", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_WARP_SIZE", ("hipDeviceAttributeWarpSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_PITCH", ("hipDeviceAttributeMaxPitch", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK", ("hipDeviceAttributeMaxRegistersPerBlock", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK", ("hipDeviceAttributeMaxRegistersPerBlock", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_CLOCK_RATE", ("hipDeviceAttributeClockRate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT", ("hipDeviceAttributeTextureAlignment", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_GPU_OVERLAP", ("hipDeviceAttributeAsyncEngineCount", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT", ("hipDeviceAttributeMultiprocessorCount", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT", ("hipDeviceAttributeKernelExecTimeout", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_INTEGRATED", ("hipDeviceAttributeIntegrated", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY", ("hipDeviceAttributeCanMapHostMemory", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_COMPUTE_MODE", ("hipDeviceAttributeComputeMode", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH", ("hipDeviceAttributeMaxTexture1DWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH", ("hipDeviceAttributeMaxTexture2DWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT", ("hipDeviceAttributeMaxTexture2DHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH", ("hipDeviceAttributeMaxTexture3DWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT", ("hipDeviceAttributeMaxTexture3DHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH", ("hipDeviceAttributeMaxTexture3DDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH", ("hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT", 
("hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS", ("hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH", ("hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT", ("hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES", ("hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT", ("hipDeviceAttributeSurfaceAlignment", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS", ("hipDeviceAttributeConcurrentKernels", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_ECC_ENABLED", ("hipDeviceAttributeEccEnabled", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_PCI_BUS_ID", ("hipDeviceAttributePciBusId", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID", ("hipDeviceAttributePciDeviceId", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_TCC_DRIVER", ("hipDeviceAttributeTccDriver", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE", ("hipDeviceAttributeMemoryClockRate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH", ("hipDeviceAttributeMemoryBusWidth", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE", ("hipDeviceAttributeL2CacheSize", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR", ("hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT", ("hipDeviceAttributeAsyncEngineCount", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING", ("hipDeviceAttributeUnifiedAddressing", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH", ("hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS", ("hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER", ("hipDeviceAttributeCanTex2DGather", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH", ("hipDeviceAttributeMaxTexture2DGatherWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT", ("hipDeviceAttributeMaxTexture2DGatherHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE", ("hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE", ("hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE", ("hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID", ("hipDeviceAttributePciDomainId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT", ("hipDeviceAttributeTexturePitchAlignment", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH", 
("hipDeviceAttributeMaxTextureCubemapWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH", ("hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS", ("hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH", ("hipDeviceAttributeMaxSurface1DWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH", ("hipDeviceAttributeMaxSurface2DWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT", ("hipDeviceAttributeMaxSurface2DHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH", ("hipDeviceAttributeMaxSurface3DWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT", ("hipDeviceAttributeMaxSurface3DHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH", ("hipDeviceAttributeMaxSurface3DDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH", ("hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS", ("hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH", ("hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT", ("hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS", ("hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH", ("hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH", ("hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS", ("hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH", ("hipDeviceAttributeMaxTexture1DLinearWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH", ("hipDeviceAttributeMaxTexture2DLinearWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT", ("hipDeviceAttributeMaxTexture2DLinearHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH", ("hipDeviceAttributeMaxTexture2DLinearPitch", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH", ("hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT", ("hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR", ("hipDeviceAttributeComputeCapabilityMajor", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR", ("hipDeviceAttributeComputeCapabilityMinor", CONV_TYPE, API_DRIVER)), - 
("CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH", ("hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED", ("hipDeviceAttributeStreamPrioritiesSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED", ("hipDeviceAttributeGlobalL1CacheSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED", ("hipDeviceAttributeLocalL1CacheSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR", ("hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR", ("hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY", ("hipDeviceAttributeManagedMemory", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD", ("hipDeviceAttributeIsMultiGpuBoard", CONV_TYPE, API_DRIVER)), - ("CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID", ("hipDeviceAttributeMultiGpuBoardGroupId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED", ("hipDeviceAttributeHostNativeAtomicSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO", ("hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS", ("hipDeviceAttributePageableMemoryAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS", ("hipDeviceAttributeConcurrentManagedAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED", ("hipDeviceAttributeComputePreemptionSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM", ("hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_ATTRIBUTE_MAX", ("hipDeviceAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_CONTEXT", ("hipPointerAttributeContext", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_MEMORY_TYPE", ("hipPointerAttributeMemoryType", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_DEVICE_POINTER", ("hipPointerAttributeDevicePointer", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_HOST_POINTER", ("hipPointerAttributeHostPointer", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_P2P_TOKENS", ("hipPointerAttributeP2pTokens", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_SYNC_MEMOPS", ("hipPointerAttributeSyncMemops", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_BUFFER_ID", ("hipPointerAttributeBufferId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_POINTER_ATTRIBUTE_IS_MANAGED", ("hipPointerAttributeIsManaged", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK", ("hipFuncAttributeMaxThreadsPerBlocks", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES", ("hipFuncAttributeSharedSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES", ("hipFuncAttributeConstSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES", 
("hipFuncAttributeLocalSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_NUM_REGS", ("hipFuncAttributeNumRegs", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_PTX_VERSION", ("hipFuncAttributePtxVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_BINARY_VERSION", ("hipFuncAttributeBinaryVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_CACHE_MODE_CA", ("hipFuncAttributeCacheModeCA", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_ATTRIBUTE_MAX", ("hipFuncAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE", ("hipGraphicsMapFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY", ("hipGraphicsMapFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD", ("hipGraphicsMapFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_REGISTER_FLAGS_NONE", ("hipGraphicsRegisterFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY", ("hipGraphicsRegisterFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD", ("hipGraphicsRegisterFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST", ("hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER", ("hipGraphicsRegisterFlagsTextureGather", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_OCCUPANCY_DEFAULT", ("hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE", ("hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_FUNC_CACHE_PREFER_NONE", ("hipFuncCachePreferNone", CONV_CACHE, API_DRIVER)), - ("CU_FUNC_CACHE_PREFER_SHARED", ("hipFuncCachePreferShared", CONV_CACHE, API_DRIVER)), - ("CU_FUNC_CACHE_PREFER_L1", ("hipFuncCachePreferL1", CONV_CACHE, API_DRIVER)), - ("CU_FUNC_CACHE_PREFER_EQUAL", ("hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER)), - ("CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS", ("hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CUDA_IPC_HANDLE_SIZE", ("HIP_IPC_HANDLE_SIZE", CONV_TYPE, API_DRIVER)), - ("CU_JIT_CACHE_OPTION_NONE", ("hipJitCacheModeOptionNone", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_CACHE_OPTION_CG", ("hipJitCacheModeOptionCG", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_CACHE_OPTION_CA", ("hipJitCacheModeOptionCA", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_PREFER_PTX", ("hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_PREFER_BINARY", ("hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_MAX_REGISTERS", ("hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER)), - ("CU_JIT_THREADS_PER_BLOCK", ("hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER)), - ("CU_JIT_WALL_TIME", ("hipJitOptionWallTime", CONV_JIT, API_DRIVER)), - ("CU_JIT_INFO_LOG_BUFFER", ("hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER)), - ("CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES", ("hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER)), - ("CU_JIT_ERROR_LOG_BUFFER", ("hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER)), - ("CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES", ("hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER)), - ("CU_JIT_OPTIMIZATION_LEVEL", ("hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER)), - 
("CU_JIT_TARGET_FROM_CUCONTEXT", ("hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER)), - ("CU_JIT_TARGET", ("hipJitOptionTarget", CONV_JIT, API_DRIVER)), - ("CU_JIT_FALLBACK_STRATEGY", ("hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER)), - ("CU_JIT_GENERATE_DEBUG_INFO", ("hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER)), - ("CU_JIT_LOG_VERBOSE", ("hipJitOptionLogVerbose", CONV_JIT, API_DRIVER)), - ("CU_JIT_GENERATE_LINE_INFO", ("hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER)), - ("CU_JIT_CACHE_MODE", ("hipJitOptionCacheMode", CONV_JIT, API_DRIVER)), - ("CU_JIT_NEW_SM3X_OPT", ("hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER)), - ("CU_JIT_FAST_COMPILE", ("hipJitOptionFastCompile", CONV_JIT, API_DRIVER)), - ("CU_JIT_NUM_OPTIONS", ("hipJitOptionNumOptions", CONV_JIT, API_DRIVER)), - ("CU_TARGET_COMPUTE_10", ("hipJitTargetCompute10", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_11", ("hipJitTargetCompute11", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_12", ("hipJitTargetCompute12", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_13", ("hipJitTargetCompute13", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_20", ("hipJitTargetCompute20", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_21", ("hipJitTargetCompute21", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_30", ("hipJitTargetCompute30", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_32", ("hipJitTargetCompute32", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_35", ("hipJitTargetCompute35", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_37", ("hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_50", ("hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_52", ("hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_53", ("hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_60", ("hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_61", ("hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TARGET_COMPUTE_62", ("hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_INPUT_CUBIN", ("hipJitInputTypeBin", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_INPUT_PTX", ("hipJitInputTypePtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_INPUT_FATBINARY", ("hipJitInputTypeFatBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_INPUT_OBJECT", ("hipJitInputTypeObject", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_INPUT_LIBRARY", ("hipJitInputTypeLibrary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_JIT_NUM_INPUT_TYPES", ("hipJitInputTypeNumInputTypes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_LIMIT_STACK_SIZE", ("hipLimitStackSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_LIMIT_PRINTF_FIFO_SIZE", ("hipLimitPrintfFifoSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_LIMIT_MALLOC_HEAP_SIZE", ("hipLimitMallocHeapSize", CONV_TYPE, API_DRIVER)), - ("CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH", ("hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT", ("hipLimitDevRuntimePendingLaunchCount", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_LIMIT_STACK_SIZE", ("hipLimitStackSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ATTACH_GLOBAL", ("hipMemAttachGlobal", CONV_TYPE, API_DRIVER, 
HIP_UNSUPPORTED)), - ("CU_MEM_ATTACH_HOST", ("hipMemAttachHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEM_ATTACH_SINGLE", ("hipMemAttachSingle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMORYTYPE_HOST", ("hipMemTypeHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMORYTYPE_DEVICE", ("hipMemTypeDevice", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMORYTYPE_ARRAY", ("hipMemTypeArray", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_MEMORYTYPE_UNIFIED", ("hipMemTypeUnified", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_RESOURCE_TYPE_ARRAY", ("hipResourceTypeArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_RESOURCE_TYPE_MIPMAPPED_ARRAY", ("hipResourceTypeMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_RESOURCE_TYPE_LINEAR", ("hipResourceTypeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_RESOURCE_TYPE_PITCH2D", ("hipResourceTypePitch2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_RES_VIEW_FORMAT_NONE", ("hipResViewFormatNone", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_1X8", ("hipResViewFormatUnsignedChar1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_2X8", ("hipResViewFormatUnsignedChar2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_4X8", ("hipResViewFormatUnsignedChar4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_1X8", ("hipResViewFormatSignedChar1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_2X8", ("hipResViewFormatSignedChar2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_4X8", ("hipResViewFormatSignedChar4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_1X16", ("hipResViewFormatUnsignedShort1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_2X16", ("hipResViewFormatUnsignedShort2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_4X16", ("hipResViewFormatUnsignedShort4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_1X16", ("hipResViewFormatSignedShort1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_2X16", ("hipResViewFormatSignedShort2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_4X16", ("hipResViewFormatSignedShort4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_1X32", ("hipResViewFormatUnsignedInt1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_2X32", ("hipResViewFormatUnsignedInt2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UINT_4X32", ("hipResViewFormatUnsignedInt4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_1X32", ("hipResViewFormatSignedInt1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_2X32", ("hipResViewFormatSignedInt2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SINT_4X32", ("hipResViewFormatSignedInt4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_FLOAT_1X16", ("hipResViewFormatHalf1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_FLOAT_2X16", ("hipResViewFormatHalf2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_FLOAT_4X16", ("hipResViewFormatHalf4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_FLOAT_1X32", ("hipResViewFormatFloat1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_FLOAT_2X32", ("hipResViewFormatFloat2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_FLOAT_4X32", ("hipResViewFormatFloat4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UNSIGNED_BC1", ("hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UNSIGNED_BC2", ("hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UNSIGNED_BC3", ("hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_DRIVER)), - 
("CU_RES_VIEW_FORMAT_UNSIGNED_BC4", ("hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SIGNED_BC4", ("hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UNSIGNED_BC5", ("hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SIGNED_BC5", ("hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UNSIGNED_BC6H", ("hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_SIGNED_BC6H", ("hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER)), - ("CU_RES_VIEW_FORMAT_UNSIGNED_BC7", ("hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER)), - ("CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE", ("hipSharedMemBankSizeDefault", CONV_TYPE, API_DRIVER)), - ("CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE", ("hipSharedMemBankSizeFourByte", CONV_TYPE, API_DRIVER)), - ("CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE", ("hipSharedMemBankSizeEightByte", CONV_TYPE, API_DRIVER)), - ("CU_STREAM_DEFAULT", ("hipStreamDefault", CONV_TYPE, API_DRIVER)), - ("CU_STREAM_NON_BLOCKING", ("hipStreamNonBlocking", CONV_TYPE, API_DRIVER)), - ("CU_STREAM_WAIT_VALUE_GEQ", ("hipStreamWaitValueGeq", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_WAIT_VALUE_EQ", ("hipStreamWaitValueEq", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_WAIT_VALUE_AND", ("hipStreamWaitValueAnd", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_WAIT_VALUE_FLUSH", ("hipStreamWaitValueFlush", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_WRITE_VALUE_DEFAULT", ("hipStreamWriteValueDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER", ("hipStreamWriteValueNoMemoryBarrier", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_MEM_OP_WAIT_VALUE_32", ("hipStreamBatchMemOpWaitValue32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_MEM_OP_WRITE_VALUE_32", ("hipStreamBatchMemOpWriteValue32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES", ("hipStreamBatchMemOpFlushRemoteWrites", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGetErrorName", ("hipGetErrorName___", CONV_ERROR, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGetErrorString", ("hipGetErrorString___", CONV_ERROR, API_DRIVER, HIP_UNSUPPORTED)), - ("cuInit", ("hipInit", CONV_INIT, API_DRIVER)), - ("cuDriverGetVersion", ("hipDriverGetVersion", CONV_VERSION, API_DRIVER)), - ("cuCtxCreate_v2", ("hipCtxCreate", CONV_CONTEXT, API_DRIVER)), - ("cuCtxDestroy_v2", ("hipCtxDestroy", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetApiVersion", ("hipCtxGetApiVersion", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetCacheConfig", ("hipCtxGetCacheConfig", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetCurrent", ("hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetDevice", ("hipCtxGetDevice", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetFlags", ("hipCtxGetFlags", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetLimit", ("hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), - ("cuCtxGetSharedMemConfig", ("hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER)), - ("cuCtxGetStreamPriorityRange", ("hipCtxGetStreamPriorityRange", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), - ("cuCtxPopCurrent_v2", ("hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER)), - ("cuCtxPushCurrent_v2", ("hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER)), - ("cuCtxSetCacheConfig", ("hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER)), - ("cuCtxSetCurrent", ("hipCtxSetCurrent", 
CONV_CONTEXT, API_DRIVER)), - ("cuCtxSetLimit", ("hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), - ("cuCtxSetSharedMemConfig", ("hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER)), - ("cuCtxSynchronize", ("hipCtxSynchronize", CONV_CONTEXT, API_DRIVER)), - ("cuCtxAttach", ("hipCtxAttach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), - ("cuCtxDetach", ("hipCtxDetach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), - ("cuCtxEnablePeerAccess", ("hipCtxEnablePeerAccess", CONV_PEER, API_DRIVER)), - ("cuCtxDisablePeerAccess", ("hipCtxDisablePeerAccess", CONV_PEER, API_DRIVER)), - ("cuDeviceCanAccessPeer", ("hipDeviceCanAccessPeer", CONV_PEER, API_DRIVER)), - ("cuDeviceGetP2PAttribute", ("hipDeviceGetP2PAttribute", CONV_PEER, API_DRIVER, HIP_UNSUPPORTED)), - ("cuDevicePrimaryCtxGetState", ("hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER)), - ("cuDevicePrimaryCtxRelease", ("hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER)), - ("cuDevicePrimaryCtxReset", ("hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER)), - ("cuDevicePrimaryCtxRetain", ("hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER)), - ("cuDevicePrimaryCtxSetFlags", ("hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER)), - ("cuDeviceGet", ("hipGetDevice", CONV_DEVICE, API_DRIVER)), - ("cuDeviceGetName", ("hipDeviceGetName", CONV_DEVICE, API_DRIVER)), - ("cuDeviceGetCount", ("hipGetDeviceCount", CONV_DEVICE, API_DRIVER)), - ("cuDeviceGetAttribute", ("hipDeviceGetAttribute", CONV_DEVICE, API_DRIVER)), - ("cuDeviceGetPCIBusId", ("hipDeviceGetPCIBusId", CONV_DEVICE, API_DRIVER)), - ("cuDeviceGetByPCIBusId", ("hipDeviceGetByPCIBusId", CONV_DEVICE, API_DRIVER)), - ("cuDeviceTotalMem_v2", ("hipDeviceTotalMem", CONV_DEVICE, API_DRIVER)), - ("cuDeviceComputeCapability", ("hipDeviceComputeCapability", CONV_DEVICE, API_DRIVER)), - ("cuDeviceGetProperties", ("hipGetDeviceProperties", CONV_DEVICE, API_DRIVER)), - ("cuLinkAddData", ("hipLinkAddData", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLinkAddFile", ("hipLinkAddFile", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLinkComplete", ("hipLinkComplete", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLinkCreate", ("hipLinkCreate", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLinkDestroy", ("hipLinkDestroy", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuModuleGetFunction", ("hipModuleGetFunction", CONV_MODULE, API_DRIVER)), - ("cuModuleGetGlobal_v2", ("hipModuleGetGlobal", CONV_MODULE, API_DRIVER)), - ("cuModuleGetSurfRef", ("hipModuleGetSurfRef", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuModuleGetTexRef", ("hipModuleGetTexRef", CONV_MODULE, API_DRIVER)), - ("cuModuleLoad", ("hipModuleLoad", CONV_MODULE, API_DRIVER)), - ("cuModuleLoadData", ("hipModuleLoadData", CONV_MODULE, API_DRIVER)), - ("cuModuleLoadDataEx", ("hipModuleLoadDataEx", CONV_MODULE, API_DRIVER)), - ("cuModuleLoadFatBinary", ("hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuModuleUnload", ("hipModuleUnload", CONV_MODULE, API_DRIVER)), - ("CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK", ("hipDeviceP2PAttributePerformanceRank", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED", ("hipDeviceP2PAttributeAccessSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED", ("hipDeviceP2PAttributeNativeAtomicSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_EVENT_DEFAULT", ("hipEventDefault", CONV_EVENT, API_DRIVER)), - ("CU_EVENT_BLOCKING_SYNC", 
("hipEventBlockingSync", CONV_EVENT, API_DRIVER)), - ("CU_EVENT_DISABLE_TIMING", ("hipEventDisableTiming", CONV_EVENT, API_DRIVER)), - ("CU_EVENT_INTERPROCESS", ("hipEventInterprocess", CONV_EVENT, API_DRIVER)), - ("cuEventCreate", ("hipEventCreate", CONV_EVENT, API_DRIVER)), - ("cuEventDestroy_v2", ("hipEventDestroy", CONV_EVENT, API_DRIVER)), - ("cuEventElapsedTime", ("hipEventElapsedTime", CONV_EVENT, API_DRIVER)), - ("cuEventQuery", ("hipEventQuery", CONV_EVENT, API_DRIVER)), - ("cuEventRecord", ("hipEventRecord", CONV_EVENT, API_DRIVER)), - ("cuEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_DRIVER)), - ("cuFuncGetAttribute", ("hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuFuncSetCacheConfig", ("hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER)), - ("cuFuncSetSharedMemConfig", ("hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLaunchKernel", ("hipModuleLaunchKernel", CONV_MODULE, API_DRIVER)), - ("cuFuncSetBlockShape", ("hipFuncSetBlockShape", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuFuncSetSharedSize", ("hipFuncSetSharedSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLaunch", ("hipLaunch", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLaunchGrid", ("hipLaunchGrid", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuLaunchGridAsync", ("hipLaunchGridAsync", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuParamSetf", ("hipParamSetf", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuParamSeti", ("hipParamSeti", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuParamSetSize", ("hipParamSetSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuParamSetSize", ("hipParamSetSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuParamSetv", ("hipParamSetv", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuOccupancyMaxActiveBlocksPerMultiprocessor", ("hipOccupancyMaxActiveBlocksPerMultiprocessor", CONV_OCCUPANCY, API_DRIVER)), - ("cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", ("hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", CONV_OCCUPANCY, API_DRIVER, HIP_UNSUPPORTED)), - ("cuOccupancyMaxPotentialBlockSize", ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER)), - ("cuOccupancyMaxPotentialBlockSizeWithFlags", ("hipOccupancyMaxPotentialBlockSizeWithFlags", CONV_OCCUPANCY, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamAddCallback", ("hipStreamAddCallback", CONV_STREAM, API_DRIVER)), - ("cuStreamAttachMemAsync", ("hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamCreate", ("hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamCreateWithPriority", ("hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamDestroy_v2", ("hipStreamDestroy", CONV_STREAM, API_DRIVER)), - ("cuStreamGetFlags", ("hipStreamGetFlags", CONV_STREAM, API_DRIVER)), - ("cuStreamGetPriority", ("hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamQuery", ("hipStreamQuery", CONV_STREAM, API_DRIVER)), - ("cuStreamSynchronize", ("hipStreamSynchronize", CONV_STREAM, API_DRIVER)), - ("cuStreamWaitEvent", ("hipStreamWaitEvent", CONV_STREAM, API_DRIVER)), - ("cuStreamWaitValue32", ("hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamWriteValue32", ("hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuStreamBatchMemOp", ("hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuArray3DCreate", ("hipArray3DCreate", CONV_MEM, API_DRIVER)), - 
("cuArray3DGetDescriptor", ("hipArray3DGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuArrayCreate", ("hipArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuArrayDestroy", ("hipArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuArrayGetDescriptor", ("hipArrayGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuIpcCloseMemHandle", ("hipIpcCloseMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuIpcGetEventHandle", ("hipIpcGetEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuIpcGetMemHandle", ("hipIpcGetMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuIpcOpenEventHandle", ("hipIpcOpenEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuIpcOpenMemHandle", ("hipIpcOpenMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemAlloc_v2", ("hipMalloc", CONV_MEM, API_DRIVER)), - ("cuMemAllocHost", ("hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemAllocManaged", ("hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemAllocPitch", ("hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy", ("hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy2D", ("hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy2DAsync", ("hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy2DUnaligned", ("hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy3D", ("hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy3DAsync", ("hipMemcpy3DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy3DPeer", ("hipMemcpy3DPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpy3DPeerAsync", ("hipMemcpy3DPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyAsync", ("hipMemcpyAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyAtoA", ("hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyAtoD", ("hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyAtoH", ("hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyAtoHAsync", ("hipMemcpyAtoHAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyDtoA", ("hipMemcpyDtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyDtoD_v2", ("hipMemcpyDtoD", CONV_MEM, API_DRIVER)), - ("cuMemcpyDtoDAsync_v2", ("hipMemcpyDtoDAsync", CONV_MEM, API_DRIVER)), - ("cuMemcpyDtoH_v2", ("hipMemcpyDtoH", CONV_MEM, API_DRIVER)), - ("cuMemcpyDtoHAsync_v2", ("hipMemcpyDtoHAsync", CONV_MEM, API_DRIVER)), - ("cuMemcpyHtoA", ("hipMemcpyHtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyHtoAAsync", ("hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyHtoD_v2", ("hipMemcpyHtoD", CONV_MEM, API_DRIVER)), - ("cuMemcpyHtoDAsync_v2", ("hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER)), - ("cuMemcpyPeerAsync", ("hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemcpyPeer", ("hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemFree_v2", ("hipFree", CONV_MEM, API_DRIVER)), - ("cuMemFreeHost", ("hipHostFree", CONV_MEM, API_DRIVER)), - ("cuMemGetAddressRange", ("hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemGetInfo_v2", ("hipMemGetInfo", CONV_MEM, API_DRIVER)), - ("cuMemHostAlloc", ("hipHostMalloc", CONV_MEM, API_DRIVER)), - ("cuMemHostGetDevicePointer", ("hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemHostGetFlags", ("hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - 
("cuMemHostRegister_v2", ("hipHostRegister", CONV_MEM, API_DRIVER)), - ("cuMemHostUnregister", ("hipHostUnregister", CONV_MEM, API_DRIVER)), - ("cuMemsetD16_v2", ("hipMemsetD16", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD16Async", ("hipMemsetD16Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD2D16_v2", ("hipMemsetD2D16", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD2D16Async", ("hipMemsetD2D16Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD2D32_v2", ("hipMemsetD2D32", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD2D32Async", ("hipMemsetD2D32Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD2D8_v2", ("hipMemsetD2D8", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD2D8Async", ("hipMemsetD2D8Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD32_v2", ("hipMemset", CONV_MEM, API_DRIVER)), - ("cuMemsetD32Async", ("hipMemsetAsync", CONV_MEM, API_DRIVER)), - ("cuMemsetD8_v2", ("hipMemsetD8", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemsetD8Async", ("hipMemsetD8Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMipmappedArrayCreate", ("hipMipmappedArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMipmappedArrayDestroy", ("hipMipmappedArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMipmappedArrayGetLevel", ("hipMipmappedArrayGetLevel", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemPrefetchAsync", ("hipMemPrefetchAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemAdvise", ("hipMemAdvise", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemRangeGetAttribute", ("hipMemRangeGetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuMemRangeGetAttributes", ("hipMemRangeGetAttributes", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuPointerGetAttribute", ("hipPointerGetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuPointerGetAttributes", ("hipPointerGetAttributes", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("cuPointerSetAttribute", ("hipPointerSetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_TR_FILTER_MODE_POINT", ("hipFilterModePoint", CONV_TEX, API_DRIVER)), - ("CU_TR_FILTER_MODE_LINEAR", ("hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetAddress", ("hipTexRefGetAddress", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetAddressMode", ("hipTexRefGetAddressMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetArray", ("hipTexRefGetArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetBorderColor", ("hipTexRefGetBorderColor", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetFilterMode", ("hipTexRefGetFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetFlags", ("hipTexRefGetFlags", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetFormat", ("hipTexRefGetFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetMaxAnisotropy", ("hipTexRefGetMaxAnisotropy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetMipmapFilterMode", ("hipTexRefGetMipmapFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetMipmapLevelBias", ("hipTexRefGetMipmapLevelBias", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetMipmapLevelClamp", ("hipTexRefGetMipmapLevelClamp", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefGetMipmappedArray", ("hipTexRefGetMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetAddress", ("hipTexRefSetAddress", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetAddress2D", ("hipTexRefSetAddress2D", 
CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetAddressMode", ("hipTexRefSetAddressMode", CONV_TEX, API_DRIVER)), - ("cuTexRefSetArray", ("hipTexRefSetArray", CONV_TEX, API_DRIVER)), - ("cuTexRefSetBorderColor", ("hipTexRefSetBorderColor", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetFilterMode", ("hipTexRefSetFilterMode", CONV_TEX, API_DRIVER)), - ("cuTexRefSetFlags", ("hipTexRefSetFlags", CONV_TEX, API_DRIVER)), - ("cuTexRefSetFormat", ("hipTexRefSetFormat", CONV_TEX, API_DRIVER)), - ("cuTexRefSetMaxAnisotropy", ("hipTexRefSetMaxAnisotropy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetMipmapFilterMode", ("hipTexRefSetMipmapFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetMipmapLevelBias", ("hipTexRefSetMipmapLevelBias", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetMipmapLevelClamp", ("hipTexRefSetMipmapLevelClamp", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefSetMipmappedArray", ("hipTexRefSetMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefCreate", ("hipTexRefCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexRefDestroy", ("hipTexRefDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuSurfRefGetArray", ("hipSurfRefGetArray", CONV_SURFACE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuSurfRefSetArray", ("hipSurfRefSetArray", CONV_SURFACE, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexObjectCreate", ("hipTexObjectCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexObjectDestroy", ("hipTexObjectDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexObjectGetResourceDesc", ("hipTexObjectGetResourceDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexObjectGetResourceViewDesc", ("hipTexObjectGetResourceViewDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuTexObjectGetTextureDesc", ("hipTexObjectGetTextureDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuSurfObjectCreate", ("hipSurfObjectCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuSurfObjectDestroy", ("hipSurfObjectDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuSurfObjectGetResourceDesc", ("hipSurfObjectGetResourceDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsMapResources", ("hipGraphicsMapResources", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsResourceGetMappedMipmappedArray", ("hipGraphicsResourceGetMappedMipmappedArray", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsResourceGetMappedPointer", ("hipGraphicsResourceGetMappedPointer", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsResourceSetMapFlags", ("hipGraphicsResourceSetMapFlags", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsSubResourceGetMappedArray", ("hipGraphicsSubResourceGetMappedArray", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsUnmapResources", ("hipGraphicsUnmapResources", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsUnregisterResource", ("hipGraphicsUnregisterResource", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED)), - ("cuProfilerInitialize", ("hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED)), - ("cuProfilerStart", ("hipProfilerStart", CONV_OTHER, API_DRIVER)), - ("cuProfilerStop", ("hipProfilerStop", CONV_OTHER, API_DRIVER)), - ("CU_GL_DEVICE_LIST_ALL", ("HIP_GL_DEVICE_LIST_ALL", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GL_DEVICE_LIST_CURRENT_FRAME", ("HIP_GL_DEVICE_LIST_CURRENT_FRAME", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GL_DEVICE_LIST_NEXT_FRAME", ("HIP_GL_DEVICE_LIST_NEXT_FRAME", CONV_GL, API_DRIVER, 
HIP_UNSUPPORTED)), - ("cuGLGetDevices", ("hipGLGetDevices", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsGLRegisterBuffer", ("hipGraphicsGLRegisterBuffer", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsGLRegisterImage", ("hipGraphicsGLRegisterImage", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuWGLGetDevice", ("hipWGLGetDevice", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GL_MAP_RESOURCE_FLAGS_NONE", ("HIP_GL_MAP_RESOURCE_FLAGS_NONE", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY", ("HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", ("HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLCtxCreate", ("hipGLCtxCreate", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLInit", ("hipGLInit", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLMapBufferObject", ("hipGLMapBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLMapBufferObjectAsync", ("hipGLMapBufferObjectAsync", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLRegisterBufferObject", ("hipGLRegisterBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLSetBufferObjectMapFlags", ("hipGLSetBufferObjectMapFlags", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLUnmapBufferObject", ("hipGLUnmapBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLUnmapBufferObjectAsync", ("hipGLUnmapBufferObjectAsync", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGLUnregisterBufferObject", ("hipGLUnregisterBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_DEVICE_LIST_ALL", ("HIP_D3D9_DEVICE_LIST_ALL", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_DEVICE_LIST_CURRENT_FRAME", ("HIP_D3D9_DEVICE_LIST_CURRENT_FRAME", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_DEVICE_LIST_NEXT_FRAME", ("HIP_D3D9_DEVICE_LIST_NEXT_FRAME", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9CtxCreate", ("hipD3D9CtxCreate", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9CtxCreateOnDevice", ("hipD3D9CtxCreateOnDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9GetDevice", ("hipD3D9GetDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9GetDevices", ("hipD3D9GetDevices", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9GetDirect3DDevice", ("hipD3D9GetDirect3DDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsD3D9RegisterResource", ("hipGraphicsD3D9RegisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_MAPRESOURCE_FLAGS_NONE", ("HIP_D3D9_MAPRESOURCE_FLAGS_NONE", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_MAPRESOURCE_FLAGS_READONLY", ("HIP_D3D9_MAPRESOURCE_FLAGS_READONLY", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", ("HIP_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_REGISTER_FLAGS_NONE", ("HIP_D3D9_REGISTER_FLAGS_NONE", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D9_REGISTER_FLAGS_ARRAY", ("HIP_D3D9_REGISTER_FLAGS_ARRAY", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9MapResources", ("hipD3D9MapResources", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9RegisterResource", ("hipD3D9RegisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9ResourceGetMappedArray", ("hipD3D9ResourceGetMappedArray", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9ResourceGetMappedPitch", ("hipD3D9ResourceGetMappedPitch", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9ResourceGetMappedPointer", 
("hipD3D9ResourceGetMappedPointer", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9ResourceGetMappedSize", ("hipD3D9ResourceGetMappedSize", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9ResourceGetSurfaceDimensions", ("hipD3D9ResourceGetSurfaceDimensions", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9ResourceSetMapFlags", ("hipD3D9ResourceSetMapFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9UnmapResources", ("hipD3D9UnmapResources", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D9UnregisterResource", ("hipD3D9UnregisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_DEVICE_LIST_ALL", ("HIP_D3D10_DEVICE_LIST_ALL", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_DEVICE_LIST_CURRENT_FRAME", ("HIP_D3D10_DEVICE_LIST_CURRENT_FRAME", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_DEVICE_LIST_NEXT_FRAME", ("HIP_D3D10_DEVICE_LIST_NEXT_FRAME", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10GetDevice", ("hipD3D10GetDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10GetDevices", ("hipD3D10GetDevices", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsD3D10RegisterResource", ("hipGraphicsD3D10RegisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_MAPRESOURCE_FLAGS_NONE", ("HIP_D3D10_MAPRESOURCE_FLAGS_NONE", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_MAPRESOURCE_FLAGS_READONLY", ("HIP_D3D10_MAPRESOURCE_FLAGS_READONLY", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", ("HIP_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_REGISTER_FLAGS_NONE", ("HIP_D3D10_REGISTER_FLAGS_NONE", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D10_REGISTER_FLAGS_ARRAY", ("HIP_D3D10_REGISTER_FLAGS_ARRAY", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10CtxCreate", ("hipD3D10CtxCreate", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10CtxCreateOnDevice", ("hipD3D10CtxCreateOnDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10GetDirect3DDevice", ("hipD3D10GetDirect3DDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10MapResources", ("hipD3D10MapResources", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10RegisterResource", ("hipD3D10RegisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10ResourceGetMappedArray", ("hipD3D10ResourceGetMappedArray", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10ResourceGetMappedPitch", ("hipD3D10ResourceGetMappedPitch", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10ResourceGetMappedPointer", ("hipD3D10ResourceGetMappedPointer", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10ResourceGetMappedSize", ("hipD3D10ResourceGetMappedSize", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10ResourceGetSurfaceDimensions", ("hipD3D10ResourceGetSurfaceDimensions", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD310ResourceSetMapFlags", ("hipD3D10ResourceSetMapFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10UnmapResources", ("hipD3D10UnmapResources", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D10UnregisterResource", ("hipD3D10UnregisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D11_DEVICE_LIST_ALL", ("HIP_D3D11_DEVICE_LIST_ALL", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D11_DEVICE_LIST_CURRENT_FRAME", ("HIP_D3D11_DEVICE_LIST_CURRENT_FRAME", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("CU_D3D11_DEVICE_LIST_NEXT_FRAME", ("HIP_D3D11_DEVICE_LIST_NEXT_FRAME", 
CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D11GetDevice", ("hipD3D11GetDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D11GetDevices", ("hipD3D11GetDevices", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsD3D11RegisterResource", ("hipGraphicsD3D11RegisterResource", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D11CtxCreate", ("hipD3D11CtxCreate", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D11CtxCreateOnDevice", ("hipD3D11CtxCreateOnDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuD3D11GetDirect3DDevice", ("hipD3D11GetDirect3DDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsVDPAURegisterOutputSurface", ("hipGraphicsVDPAURegisterOutputSurface", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsVDPAURegisterVideoSurface", ("hipGraphicsVDPAURegisterVideoSurface", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED)), - ("cuVDPAUGetDevice", ("hipVDPAUGetDevice", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED)), - ("cuVDPAUCtxCreate", ("hipVDPAUCtxCreate", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamConsumerAcquireFrame", ("hipEGLStreamConsumerAcquireFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamConsumerConnect", ("hipEGLStreamConsumerConnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamConsumerConnectWithFlags", ("hipEGLStreamConsumerConnectWithFlags", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamConsumerDisconnect", ("hipEGLStreamConsumerDisconnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamConsumerReleaseFrame", ("hipEGLStreamConsumerReleaseFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamProducerConnect", ("hipEGLStreamProducerConnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamProducerDisconnect", ("hipEGLStreamProducerDisconnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamProducerPresentFrame", ("hipEGLStreamProducerPresentFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuEGLStreamProducerReturnFrame", ("hipEGLStreamProducerReturnFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsEGLRegisterImage", ("hipGraphicsEGLRegisterImage", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cuGraphicsResourceGetMappedEglFrame", ("hipGraphicsResourceGetMappedEglFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED)), - ("cudaDataType_t", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDataType", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_16F", ("hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_16F", ("hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32F", ("hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32F", ("hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_64F", ("hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_64F", ("hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_8I", ("hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_8I", ("hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_8U", ("hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_8U", ("hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32I", ("hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32I", ("hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_R_32U", ("hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("CUDA_C_32U", ("hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("MAJOR_VERSION", ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, 
HIP_UNSUPPORTED)), - ("MINOR_VERSION", ("hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("PATCH_LEVEL", ("hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAttachGlobal", ("hipMemAttachGlobal", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAttachHost", ("hipMemAttachHost", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAttachSingle", ("hipMemAttachSingle", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaOccupancyDefault", ("hipOccupancyDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaOccupancyDisableCachingOverride", ("hipOccupancyDisableCachingOverride", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetLastError", ("hipGetLastError", CONV_ERROR, API_RUNTIME)), - ("cudaPeekAtLastError", ("hipPeekAtLastError", CONV_ERROR, API_RUNTIME)), - ("cudaGetErrorName", ("hipGetErrorName", CONV_ERROR, API_RUNTIME)), - ("cudaGetErrorString", ("hipGetErrorString", CONV_ERROR, API_RUNTIME)), - ("cudaMemcpy3DParms", ("hipMemcpy3DParms", CONV_MEM, API_RUNTIME)), - ("cudaMemcpy3DPeerParms", ("hipMemcpy3DPeerParms", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy", ("hipMemcpy", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyToArray", ("hipMemcpyToArray", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyToSymbol", ("hipMemcpyToSymbol", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyToSymbolAsync", ("hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyAsync", ("hipMemcpyAsync", CONV_MEM, API_RUNTIME)), - ("cudaMemcpy2D", ("hipMemcpy2D", CONV_MEM, API_RUNTIME)), - ("cudaMemcpy2DAsync", ("hipMemcpy2DAsync", CONV_MEM, API_RUNTIME)), - ("cudaMemcpy2DToArray", ("hipMemcpy2DToArray", CONV_MEM, API_RUNTIME)), - ("cudaMemcpy2DArrayToArray", ("hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy2DFromArray", ("hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy2DFromArrayAsync", ("hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy2DToArrayAsync", ("hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy3D", ("hipMemcpy3D", CONV_MEM, API_RUNTIME)), - ("cudaMemcpy3DAsync", ("hipMemcpy3DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy3DPeer", ("hipMemcpy3DPeer", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpy3DPeerAsync", ("hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpyArrayToArray", ("hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpyFromArrayAsync", ("hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpyFromSymbol", ("hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyFromSymbolAsync", ("hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME)), - ("cudaMemAdvise", ("hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeGetAttribute", ("hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeGetAttributes", ("hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAdviseSetReadMostly", ("hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAdviseUnsetReadMostly", ("hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAdviseSetPreferredLocation", ("hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAdviseUnsetPreferredLocation", ("hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - 
("cudaMemAdviseSetAccessedBy", ("hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemAdviseUnsetAccessedBy", ("hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeAttributeReadMostly", ("hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeAttributePreferredLocation", ("hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeAttributeAccessedBy", ("hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemRangeAttributeLastPrefetchLocation", ("hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemcpyHostToHost", ("hipMemcpyHostToHost", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyHostToDevice", ("hipMemcpyHostToDevice", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyDeviceToHost", ("hipMemcpyDeviceToHost", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyDeviceToDevice", ("hipMemcpyDeviceToDevice", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyDefault", ("hipMemcpyDefault", CONV_MEM, API_RUNTIME)), - ("cudaMemset", ("hipMemset", CONV_MEM, API_RUNTIME)), - ("cudaMemsetAsync", ("hipMemsetAsync", CONV_MEM, API_RUNTIME)), - ("cudaMemset2D", ("hipMemset2D", CONV_MEM, API_RUNTIME)), - ("cudaMemset2DAsync", ("hipMemset2DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemset3D", ("hipMemset3D", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemset3DAsync", ("hipMemset3DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemGetInfo", ("hipMemGetInfo", CONV_MEM, API_RUNTIME)), - ("cudaArrayGetInfo", ("hipArrayGetInfo", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaFreeMipmappedArray", ("hipFreeMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetMipmappedArrayLevel", ("hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetSymbolAddress", ("hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetSymbolSize", ("hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMemPrefetchAsync", ("hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMallocHost", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), - ("cudaMallocArray", ("hipMallocArray", CONV_MEM, API_RUNTIME)), - ("cudaMalloc", ("hipMalloc", CONV_MEM, API_RUNTIME)), - ("cudaMalloc3D", ("hipMalloc3D", CONV_MEM, API_RUNTIME)), - ("cudaMalloc3DArray", ("hipMalloc3DArray", CONV_MEM, API_RUNTIME)), - ("cudaMallocManaged", ("hipMallocManaged", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMallocMipmappedArray", ("hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaMallocPitch", ("hipMallocPitch", CONV_MEM, API_RUNTIME)), - ("cudaFreeHost", ("hipHostFree", CONV_MEM, API_RUNTIME)), - ("cudaFreeArray", ("hipFreeArray", CONV_MEM, API_RUNTIME)), - ("cudaFree", ("hipFree", CONV_MEM, API_RUNTIME)), - ("cudaHostRegister", ("hipHostRegister", CONV_MEM, API_RUNTIME)), - ("cudaHostUnregister", ("hipHostUnregister", CONV_MEM, API_RUNTIME)), - ("cudaHostAlloc", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), - ("cudaMemoryTypeHost", ("hipMemoryTypeHost", CONV_MEM, API_RUNTIME)), - ("cudaMemoryTypeDevice", ("hipMemoryTypeDevice", CONV_MEM, API_RUNTIME)), - ("make_cudaExtent", ("make_hipExtent", CONV_MEM, API_RUNTIME)), - ("make_cudaPitchedPtr", ("make_hipPitchedPtr", CONV_MEM, API_RUNTIME)), - ("make_cudaPos", ("make_hipPos", CONV_MEM, API_RUNTIME)), - ("cudaHostAllocDefault", ("hipHostMallocDefault", CONV_MEM, API_RUNTIME)), - 
("cudaHostAllocPortable", ("hipHostMallocPortable", CONV_MEM, API_RUNTIME)), - ("cudaHostAllocMapped", ("hipHostMallocMapped", CONV_MEM, API_RUNTIME)), - ("cudaHostAllocWriteCombined", ("hipHostMallocWriteCombined", CONV_MEM, API_RUNTIME)), - ("cudaHostGetFlags", ("hipHostGetFlags", CONV_MEM, API_RUNTIME)), - ("cudaHostRegisterDefault", ("hipHostRegisterDefault", CONV_MEM, API_RUNTIME)), - ("cudaHostRegisterPortable", ("hipHostRegisterPortable", CONV_MEM, API_RUNTIME)), - ("cudaHostRegisterMapped", ("hipHostRegisterMapped", CONV_MEM, API_RUNTIME)), - ("cudaHostRegisterIoMemory", ("hipHostRegisterIoMemory", CONV_MEM, API_RUNTIME)), - # ("warpSize", ("hipWarpSize", CONV_SPECIAL_FUNC, API_RUNTIME), (HIP actually uses warpSize...), - ("cudaEventCreate", ("hipEventCreate", CONV_EVENT, API_RUNTIME)), - ("cudaEventCreateWithFlags", ("hipEventCreateWithFlags", CONV_EVENT, API_RUNTIME)), - ("cudaEventDestroy", ("hipEventDestroy", CONV_EVENT, API_RUNTIME)), - ("cudaEventRecord", ("hipEventRecord", CONV_EVENT, API_RUNTIME)), - ("cudaEventElapsedTime", ("hipEventElapsedTime", CONV_EVENT, API_RUNTIME)), - ("cudaEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_RUNTIME)), - ("cudaEventQuery", ("hipEventQuery", CONV_EVENT, API_RUNTIME)), - ("cudaEventDefault", ("hipEventDefault", CONV_EVENT, API_RUNTIME)), - ("cudaEventBlockingSync", ("hipEventBlockingSync", CONV_EVENT, API_RUNTIME)), - ("cudaEventDisableTiming", ("hipEventDisableTiming", CONV_EVENT, API_RUNTIME)), - ("cudaEventInterprocess", ("hipEventInterprocess", CONV_EVENT, API_RUNTIME)), - ("cudaStreamCreate", ("hipStreamCreate", CONV_STREAM, API_RUNTIME)), - ("cudaStreamCreateWithFlags", ("hipStreamCreateWithFlags", CONV_STREAM, API_RUNTIME)), - ("cudaStreamCreateWithPriority", ("hipStreamCreateWithPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaStreamDestroy", ("hipStreamDestroy", CONV_STREAM, API_RUNTIME)), - ("cudaStreamWaitEvent", ("hipStreamWaitEvent", CONV_STREAM, API_RUNTIME)), - ("cudaStreamSynchronize", ("hipStreamSynchronize", CONV_STREAM, API_RUNTIME)), - ("cudaStreamGetFlags", ("hipStreamGetFlags", CONV_STREAM, API_RUNTIME)), - ("cudaStreamQuery", ("hipStreamQuery", CONV_STREAM, API_RUNTIME)), - ("cudaStreamAddCallback", ("hipStreamAddCallback", CONV_STREAM, API_RUNTIME)), - ("cudaStreamAttachMemAsync", ("hipStreamAttachMemAsync", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaStreamGetPriority", ("hipStreamGetPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaStreamDefault", ("hipStreamDefault", CONV_TYPE, API_RUNTIME)), - ("cudaStreamNonBlocking", ("hipStreamNonBlocking", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceSynchronize", ("hipDeviceSynchronize", CONV_DEVICE, API_RUNTIME)), - ("cudaDeviceReset", ("hipDeviceReset", CONV_DEVICE, API_RUNTIME)), - ("cudaSetDevice", ("hipSetDevice", CONV_DEVICE, API_RUNTIME)), - ("cudaGetDevice", ("hipGetDevice", CONV_DEVICE, API_RUNTIME)), - ("cudaGetDeviceCount", ("hipGetDeviceCount", CONV_DEVICE, API_RUNTIME)), - ("cudaChooseDevice", ("hipChooseDevice", CONV_DEVICE, API_RUNTIME)), - ("cudaThreadExit", ("hipDeviceReset", CONV_THREAD, API_RUNTIME)), - ("cudaThreadGetCacheConfig", ("hipDeviceGetCacheConfig", CONV_THREAD, API_RUNTIME)), - ("cudaThreadGetLimit", ("hipThreadGetLimit", CONV_THREAD, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaThreadSetCacheConfig", ("hipDeviceSetCacheConfig", CONV_THREAD, API_RUNTIME)), - ("cudaThreadSetLimit", ("hipThreadSetLimit", CONV_THREAD, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaThreadSynchronize", ("hipDeviceSynchronize", 
CONV_THREAD, API_RUNTIME)), - ("cudaDeviceGetAttribute", ("hipDeviceGetAttribute", CONV_DEVICE, API_RUNTIME)), - ("cudaDevAttrMaxThreadsPerBlock", ("hipDeviceAttributeMaxThreadsPerBlock", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxBlockDimX", ("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxBlockDimY", ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxBlockDimZ", ("hipDeviceAttributeMaxBlockDimZ", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxGridDimX", ("hipDeviceAttributeMaxGridDimX", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxGridDimY", ("hipDeviceAttributeMaxGridDimY", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxGridDimZ", ("hipDeviceAttributeMaxGridDimZ", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxSharedMemoryPerBlock", ("hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrTotalConstantMemory", ("hipDeviceAttributeTotalConstantMemory", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrWarpSize", ("hipDeviceAttributeWarpSize", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxPitch", ("hipDeviceAttributeMaxPitch", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxRegistersPerBlock", ("hipDeviceAttributeMaxRegistersPerBlock", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrClockRate", ("hipDeviceAttributeClockRate", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrTextureAlignment", ("hipDeviceAttributeTextureAlignment", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrGpuOverlap", ("hipDeviceAttributeGpuOverlap", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMultiProcessorCount", ("hipDeviceAttributeMultiprocessorCount", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrKernelExecTimeout", ("hipDeviceAttributeKernelExecTimeout", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrIntegrated", ("hipDeviceAttributeIntegrated", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrCanMapHostMemory", ("hipDeviceAttributeCanMapHostMemory", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrComputeMode", ("hipDeviceAttributeComputeMode", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxTexture1DWidth", ("hipDeviceAttributeMaxTexture1DWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DWidth", ("hipDeviceAttributeMaxTexture2DWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DHeight", ("hipDeviceAttributeMaxTexture2DHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture3DWidth", ("hipDeviceAttributeMaxTexture3DWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture3DHeight", ("hipDeviceAttributeMaxTexture3DHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture3DDepth", ("hipDeviceAttributeMaxTexture3DDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DLayeredWidth", ("hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DLayeredHeight", ("hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DLayeredLayers", ("hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrSurfaceAlignment", ("hipDeviceAttributeSurfaceAlignment", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrConcurrentKernels", ("hipDeviceAttributeConcurrentKernels", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrEccEnabled", ("hipDeviceAttributeEccEnabled", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - 
("cudaDevAttrPciBusId", ("hipDeviceAttributePciBusId", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrPciDeviceId", ("hipDeviceAttributePciDeviceId", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrTccDriver", ("hipDeviceAttributeTccDriver", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMemoryClockRate", ("hipDeviceAttributeMemoryClockRate", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrGlobalMemoryBusWidth", ("hipDeviceAttributeMemoryBusWidth", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrL2CacheSize", ("hipDeviceAttributeL2CacheSize", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxThreadsPerMultiProcessor", ("hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrAsyncEngineCount", ("hipDeviceAttributeAsyncEngineCount", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrUnifiedAddressing", ("hipDeviceAttributeUnifiedAddressing", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture1DLayeredWidth", ("hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture1DLayeredLayers", ("hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DGatherWidth", ("hipDeviceAttributeMaxTexture2DGatherWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DGatherHeight", ("hipDeviceAttributeMaxTexture2DGatherHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture3DWidthAlt", ("hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture3DHeightAlt", ("hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture3DDepthAlt", ("hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrPciDomainId", ("hipDeviceAttributePciDomainId", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrTexturePitchAlignment", ("hipDeviceAttributeTexturePitchAlignment", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTextureCubemapWidth", ("hipDeviceAttributeMaxTextureCubemapWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTextureCubemapLayeredWidth", ("hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTextureCubemapLayeredLayers", ("hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface1DWidth", ("hipDeviceAttributeMaxSurface1DWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface2DWidth", ("hipDeviceAttributeMaxSurface2DWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface2DHeight", ("hipDeviceAttributeMaxSurface2DHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface3DWidth", ("hipDeviceAttributeMaxSurface3DWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface3DHeight", ("hipDeviceAttributeMaxSurface3DHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface3DDepth", ("hipDeviceAttributeMaxSurface3DDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface1DLayeredWidth", ("hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface1DLayeredLayers", ("hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface2DLayeredWidth", ("hipDeviceAttributeMaxSurface2DLayeredWidth", 
CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface2DLayeredHeight", ("hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurface2DLayeredLayers", ("hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurfaceCubemapWidth", ("hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurfaceCubemapLayeredWidth", ("hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSurfaceCubemapLayeredLayers", ("hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture1DLinearWidth", ("hipDeviceAttributeMaxTexture1DLinearWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DLinearWidth", ("hipDeviceAttributeMaxTexture2DLinearWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DLinearHeight", ("hipDeviceAttributeMaxTexture2DLinearHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DLinearPitch", ("hipDeviceAttributeMaxTexture2DLinearPitch", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DMipmappedWidth", ("hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxTexture2DMipmappedHeight", ("hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrComputeCapabilityMajor", ("hipDeviceAttributeComputeCapabilityMajor", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrComputeCapabilityMinor", ("hipDeviceAttributeComputeCapabilityMinor", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxTexture1DMipmappedWidth", ("hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrStreamPrioritiesSupported", ("hipDeviceAttributeStreamPrioritiesSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrGlobalL1CacheSupported", ("hipDeviceAttributeGlobalL1CacheSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrLocalL1CacheSupported", ("hipDeviceAttributeLocalL1CacheSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrMaxSharedMemoryPerMultiprocessor", ("hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMaxRegistersPerMultiprocessor", ("hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrManagedMemory", ("hipDeviceAttributeManagedMemory", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrIsMultiGpuBoard", ("hipDeviceAttributeIsMultiGpuBoard", CONV_TYPE, API_RUNTIME)), - ("cudaDevAttrMultiGpuBoardGroupID", ("hipDeviceAttributeMultiGpuBoardGroupID", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrHostNativeAtomicSupported", ("hipDeviceAttributeHostNativeAtomicSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrSingleToDoublePrecisionPerfRatio", ("hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrPageableMemoryAccess", ("hipDeviceAttributePageableMemoryAccess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrConcurrentManagedAccess", ("hipDeviceAttributeConcurrentManagedAccess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevAttrComputePreemptionSupported", ("hipDeviceAttributeComputePreemptionSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - 
("cudaDevAttrCanUseHostPointerForRegisteredMem", ("hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaPointerGetAttributes", ("hipPointerGetAttributes", CONV_MEM, API_RUNTIME)), - ("cudaHostGetDevicePointer", ("hipHostGetDevicePointer", CONV_MEM, API_RUNTIME)), - ("cudaGetDeviceProperties", ("hipGetDeviceProperties", CONV_DEVICE, API_RUNTIME)), - ("cudaDeviceGetPCIBusId", ("hipDeviceGetPCIBusId", CONV_DEVICE, API_RUNTIME)), - ("cudaDeviceGetByPCIBusId", ("hipDeviceGetByPCIBusId", CONV_DEVICE, API_RUNTIME)), - ("cudaDeviceGetStreamPriorityRange", ("hipDeviceGetStreamPriorityRange", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSetValidDevices", ("hipSetValidDevices", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevP2PAttrPerformanceRank", ("hipDeviceP2PAttributePerformanceRank", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevP2PAttrAccessSupported", ("hipDeviceP2PAttributeAccessSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDevP2PAttrNativeAtomicSupported", ("hipDeviceP2PAttributeNativeAtomicSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDeviceGetP2PAttribute", ("hipDeviceGetP2PAttribute", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaComputeModeDefault", ("hipComputeModeDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaComputeModeExclusive", ("hipComputeModeExclusive", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaComputeModeProhibited", ("hipComputeModeProhibited", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaComputeModeExclusiveProcess", ("hipComputeModeExclusiveProcess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetDeviceFlags", ("hipGetDeviceFlags", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSetDeviceFlags", ("hipSetDeviceFlags", CONV_DEVICE, API_RUNTIME)), - ("cudaDeviceScheduleAuto", ("hipDeviceScheduleAuto", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceScheduleSpin", ("hipDeviceScheduleSpin", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceScheduleYield", ("hipDeviceScheduleYield", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceBlockingSync", ("hipDeviceScheduleBlockingSync", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceScheduleBlockingSync", ("hipDeviceScheduleBlockingSync", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceScheduleMask", ("hipDeviceScheduleMask", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDeviceMapHost", ("hipDeviceMapHost", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceLmemResizeToMax", ("hipDeviceLmemResizeToMax", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDeviceMask", ("hipDeviceMask", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDeviceSetCacheConfig", ("hipDeviceSetCacheConfig", CONV_CACHE, API_RUNTIME)), - ("cudaDeviceGetCacheConfig", ("hipDeviceGetCacheConfig", CONV_CACHE, API_RUNTIME)), - ("cudaFuncSetCacheConfig", ("hipFuncSetCacheConfig", CONV_CACHE, API_RUNTIME)), - ("cudaFuncCachePreferNone", ("hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME)), - ("cudaFuncCachePreferShared", ("hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME)), - ("cudaFuncCachePreferL1", ("hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME)), - ("cudaFuncCachePreferEqual", ("hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME)), - ("cudaFuncGetAttributes", ("hipFuncGetAttributes", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaFuncSetSharedMemConfig", ("hipFuncSetSharedMemConfig", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetParameterBuffer", ("hipGetParameterBuffer", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSetDoubleForDevice", 
("hipSetDoubleForDevice", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSetDoubleForHost", ("hipSetDoubleForHost", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaConfigureCall", ("hipConfigureCall", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaLaunch", ("hipLaunch", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaSetupArgument", ("hipSetupArgument", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDriverGetVersion", ("hipDriverGetVersion", CONV_VERSION, API_RUNTIME)), - ("cudaRuntimeGetVersion", ("hipRuntimeGetVersion", CONV_VERSION, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaOccupancyMaxPotentialBlockSize", ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_RUNTIME)), - ("cudaOccupancyMaxPotentialBlockSizeWithFlags", ("hipOccupancyMaxPotentialBlockSizeWithFlags", CONV_OCCUPANCY, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaOccupancyMaxActiveBlocksPerMultiprocessor", ("hipOccupancyMaxActiveBlocksPerMultiprocessor", CONV_OCCUPANCY, API_RUNTIME)), - ("cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", ("hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", CONV_OCCUPANCY, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaOccupancyMaxPotentialBlockSizeVariableSMem", ("hipOccupancyMaxPotentialBlockSizeVariableSMem", CONV_OCCUPANCY, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags", ("hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags", CONV_OCCUPANCY, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDeviceCanAccessPeer", ("hipDeviceCanAccessPeer", CONV_PEER, API_RUNTIME)), - ("cudaDeviceDisablePeerAccess", ("hipDeviceDisablePeerAccess", CONV_PEER, API_RUNTIME)), - ("cudaDeviceEnablePeerAccess", ("hipDeviceEnablePeerAccess", CONV_PEER, API_RUNTIME)), - ("cudaMemcpyPeerAsync", ("hipMemcpyPeerAsync", CONV_MEM, API_RUNTIME)), - ("cudaMemcpyPeer", ("hipMemcpyPeer", CONV_MEM, API_RUNTIME)), - ("cudaIpcMemLazyEnablePeerAccess", ("hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_RUNTIME)), - ("cudaDeviceSetSharedMemConfig", ("hipDeviceSetSharedMemConfig", CONV_DEVICE, API_RUNTIME)), - ("cudaDeviceGetSharedMemConfig", ("hipDeviceGetSharedMemConfig", CONV_DEVICE, API_RUNTIME)), - ("cudaSharedMemBankSizeDefault", ("hipSharedMemBankSizeDefault", CONV_TYPE, API_RUNTIME)), - ("cudaSharedMemBankSizeFourByte", ("hipSharedMemBankSizeFourByte", CONV_TYPE, API_RUNTIME)), - ("cudaSharedMemBankSizeEightByte", ("hipSharedMemBankSizeEightByte", CONV_TYPE, API_RUNTIME)), - ("cudaLimitStackSize", ("hipLimitStackSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaLimitPrintfFifoSize", ("hipLimitPrintfFifoSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaLimitMallocHeapSize", ("hipLimitMallocHeapSize", CONV_TYPE, API_RUNTIME)), - ("cudaLimitDevRuntimeSyncDepth", ("hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaLimitDevRuntimePendingLaunchCount", ("hipLimitDevRuntimePendingLaunchCount", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDeviceGetLimit", ("hipDeviceGetLimit", CONV_DEVICE, API_RUNTIME)), - ("cudaProfilerInitialize", ("hipProfilerInitialize", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaProfilerStart", ("hipProfilerStart", CONV_OTHER, API_RUNTIME)), - ("cudaProfilerStop", ("hipProfilerStop", CONV_OTHER, API_RUNTIME)), - ("cudaKeyValuePair", ("hipKeyValuePair", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaCSV", ("hipCSV", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaReadModeElementType", ("hipReadModeElementType", CONV_TEX, API_RUNTIME)), - ("cudaReadModeNormalizedFloat", 
("hipReadModeNormalizedFloat", CONV_TEX, API_RUNTIME)), - ("cudaFilterModePoint", ("hipFilterModePoint", CONV_TEX, API_RUNTIME)), - ("cudaFilterModeLinear", ("hipFilterModeLinear", CONV_TEX, API_RUNTIME)), - ("cudaBindTexture", ("hipBindTexture", CONV_TEX, API_RUNTIME)), - ("cudaUnbindTexture", ("hipUnbindTexture", CONV_TEX, API_RUNTIME)), - ("cudaBindTexture2D", ("hipBindTexture2D", CONV_TEX, API_RUNTIME)), - ("cudaBindTextureToArray", ("hipBindTextureToArray", CONV_TEX, API_RUNTIME)), - ("cudaBindTextureToMipmappedArray", ("hipBindTextureToMipmappedArray", CONV_TEX, API_RUNTIME)), - ("cudaGetTextureAlignmentOffset", ("hipGetTextureAlignmentOffset", CONV_TEX, API_RUNTIME)), - ("cudaGetTextureReference", ("hipGetTextureReference", CONV_TEX, API_RUNTIME)), - ("cudaChannelFormatKindSigned", ("hipChannelFormatKindSigned", CONV_TEX, API_RUNTIME)), - ("cudaChannelFormatKindUnsigned", ("hipChannelFormatKindUnsigned", CONV_TEX, API_RUNTIME)), - ("cudaChannelFormatKindFloat", ("hipChannelFormatKindFloat", CONV_TEX, API_RUNTIME)), - ("cudaChannelFormatKindNone", ("hipChannelFormatKindNone", CONV_TEX, API_RUNTIME)), - ("cudaCreateChannelDesc", ("hipCreateChannelDesc", CONV_TEX, API_RUNTIME)), - ("cudaGetChannelDesc", ("hipGetChannelDesc", CONV_TEX, API_RUNTIME)), - ("cudaResourceTypeArray", ("hipResourceTypeArray", CONV_TEX, API_RUNTIME)), - ("cudaResourceTypeMipmappedArray", ("hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME)), - ("cudaResourceTypeLinear", ("hipResourceTypeLinear", CONV_TEX, API_RUNTIME)), - ("cudaResourceTypePitch2D", ("hipResourceTypePitch2D", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatNone", ("hipResViewFormatNone", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedChar1", ("hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedChar2", ("hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedChar4", ("hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedChar1", ("hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedChar2", ("hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedChar4", ("hipResViewFormatSignedChar4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedShort1", ("hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedShort2", ("hipResViewFormatUnsignedShort2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedShort4", ("hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedShort1", ("hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedShort2", ("hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedShort4", ("hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedInt1", ("hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedInt2", ("hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedInt4", ("hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedInt1", ("hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedInt2", ("hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedInt4", ("hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatHalf1", ("hipResViewFormatHalf1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatHalf2", ("hipResViewFormatHalf2", CONV_TEX, API_RUNTIME)), - 
("cudaResViewFormatHalf4", ("hipResViewFormatHalf4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatFloat1", ("hipResViewFormatFloat1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatFloat2", ("hipResViewFormatFloat2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatFloat4", ("hipResViewFormatFloat4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed1", ("hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed2", ("hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed3", ("hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed4", ("hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedBlockCompressed4", ("hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed5", ("hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedBlockCompressed5", ("hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed6H", ("hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatSignedBlockCompressed6H", ("hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME)), - ("cudaResViewFormatUnsignedBlockCompressed7", ("hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME)), - ("cudaAddressModeWrap", ("hipAddressModeWrap", CONV_TEX, API_RUNTIME)), - ("cudaAddressModeClamp", ("hipAddressModeClamp", CONV_TEX, API_RUNTIME)), - ("cudaAddressModeMirror", ("hipAddressModeMirror", CONV_TEX, API_RUNTIME)), - ("cudaAddressModeBorder", ("hipAddressModeBorder", CONV_TEX, API_RUNTIME)), - ("cudaCreateTextureObject", ("hipCreateTextureObject", CONV_TEX, API_RUNTIME)), - ("cudaDestroyTextureObject", ("hipDestroyTextureObject", CONV_TEX, API_RUNTIME)), - ("cudaGetTextureObjectResourceDesc", ("hipGetTextureObjectResourceDesc", CONV_TEX, API_RUNTIME)), - ("cudaGetTextureObjectResourceViewDesc", ("hipGetTextureObjectResourceViewDesc", CONV_TEX, API_RUNTIME)), - ("cudaGetTextureObjectTextureDesc", ("hipGetTextureObjectTextureDesc", CONV_TEX, API_RUNTIME)), - ("cudaBindSurfaceToArray", ("hipBindSurfaceToArray", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetSurfaceReference", ("hipGetSurfaceReference", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaBoundaryModeZero", ("hipBoundaryModeZero", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaBoundaryModeClamp", ("hipBoundaryModeClamp", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaBoundaryModeTrap", ("hipBoundaryModeTrap", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaFormatModeForced", ("hipFormatModeForced", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaFormatModeAuto", ("hipFormatModeAuto", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaCreateSurfaceObject", ("hipCreateSurfaceObject", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaDestroySurfaceObject", ("hipDestroySurfaceObject", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGetSurfaceObjectResourceDesc", ("hipGetSurfaceObjectResourceDesc", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaIpcCloseMemHandle", ("hipIpcCloseMemHandle", CONV_DEVICE, API_RUNTIME)), - ("cudaIpcGetEventHandle", ("hipIpcGetEventHandle", CONV_DEVICE, API_RUNTIME)), - ("cudaIpcGetMemHandle", ("hipIpcGetMemHandle", CONV_DEVICE, API_RUNTIME)), - 
("cudaIpcOpenEventHandle", ("hipIpcOpenEventHandle", CONV_DEVICE, API_RUNTIME)), - ("cudaIpcOpenMemHandle", ("hipIpcOpenMemHandle", CONV_DEVICE, API_RUNTIME)), - ("cudaGLGetDevices", ("hipGLGetDevices", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsGLRegisterBuffer", ("hipGraphicsGLRegisterBuffer", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsGLRegisterImage", ("hipGraphicsGLRegisterImage", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaWGLGetDevice", ("hipWGLGetDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsMapResources", ("hipGraphicsMapResources", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsResourceGetMappedMipmappedArray", ("hipGraphicsResourceGetMappedMipmappedArray", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsResourceGetMappedPointer", ("hipGraphicsResourceGetMappedPointer", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsResourceSetMapFlags", ("hipGraphicsResourceSetMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsSubResourceGetMappedArray", ("hipGraphicsSubResourceGetMappedArray", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsUnmapResources", ("hipGraphicsUnmapResources", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsUnregisterResource", ("hipGraphicsUnregisterResource", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsCubeFacePositiveX", ("hipGraphicsCubeFacePositiveX", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsCubeFaceNegativeX", ("hipGraphicsCubeFaceNegativeX", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsCubeFacePositiveY", ("hipGraphicsCubeFacePositiveY", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsCubeFaceNegativeY", ("hipGraphicsCubeFaceNegativeY", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsCubeFacePositiveZ", ("hipGraphicsCubeFacePositiveZ", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsCubeFaceNegativeZ", ("hipGraphicsCubeFaceNegativeZ", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsMapFlagsNone", ("hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsMapFlagsReadOnly", ("hipGraphicsMapFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsMapFlagsWriteDiscard", ("hipGraphicsMapFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsRegisterFlagsNone", ("hipGraphicsRegisterFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsRegisterFlagsReadOnly", ("hipGraphicsRegisterFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsRegisterFlagsWriteDiscard", ("hipGraphicsRegisterFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsRegisterFlagsSurfaceLoadStore", ("hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsRegisterFlagsTextureGather", ("hipGraphicsRegisterFlagsTextureGather", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLDeviceListAll", ("HIP_GL_DEVICE_LIST_ALL", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLDeviceListCurrentFrame", ("HIP_GL_DEVICE_LIST_CURRENT_FRAME", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLDeviceListNextFrame", ("HIP_GL_DEVICE_LIST_NEXT_FRAME", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLGetDevices", ("hipGLGetDevices", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsGLRegisterBuffer", ("hipGraphicsGLRegisterBuffer", 
CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsGLRegisterImage", ("hipGraphicsGLRegisterImage", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaWGLGetDevice", ("hipWGLGetDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLMapFlagsNone", ("HIP_GL_MAP_RESOURCE_FLAGS_NONE", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLMapFlagsReadOnly", ("HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLMapFlagsWriteDiscard", ("HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLMapBufferObject", ("hipGLMapBufferObject__", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLMapBufferObjectAsync", ("hipGLMapBufferObjectAsync__", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLRegisterBufferObject", ("hipGLRegisterBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLSetBufferObjectMapFlags", ("hipGLSetBufferObjectMapFlags", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLSetGLDevice", ("hipGLSetGLDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLUnmapBufferObject", ("hipGLUnmapBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLUnmapBufferObjectAsync", ("hipGLUnmapBufferObjectAsync", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGLUnregisterBufferObject", ("hipGLUnregisterBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9DeviceListAll", ("HIP_D3D9_DEVICE_LIST_ALL", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9DeviceListCurrentFrame", ("HIP_D3D9_DEVICE_LIST_CURRENT_FRAME", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9DeviceListNextFrame", ("HIP_D3D9_DEVICE_LIST_NEXT_FRAME", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9GetDevice", ("hipD3D9GetDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9GetDevices", ("hipD3D9GetDevices", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9GetDirect3DDevice", ("hipD3D9GetDirect3DDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9SetDirect3DDevice", ("hipD3D9SetDirect3DDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsD3D9RegisterResource", ("hipGraphicsD3D9RegisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9MapFlags", ("hipD3D9MapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9MapFlagsNone", ("HIP_D3D9_MAPRESOURCE_FLAGS_NONE", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9MapFlagsReadOnly", ("HIP_D3D9_MAPRESOURCE_FLAGS_READONLY", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9MapFlagsWriteDiscard", ("HIP_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9RegisterFlagsNone", ("HIP_D3D9_REGISTER_FLAGS_NONE", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9RegisterFlagsArray", ("HIP_D3D9_REGISTER_FLAGS_ARRAY", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9MapResources", ("hipD3D9MapResources", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9RegisterResource", ("hipD3D9RegisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9ResourceGetMappedArray", ("hipD3D9ResourceGetMappedArray", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9ResourceGetMappedPitch", ("hipD3D9ResourceGetMappedPitch", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9ResourceGetMappedPointer", ("hipD3D9ResourceGetMappedPointer", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9ResourceGetMappedSize", ("hipD3D9ResourceGetMappedSize", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9ResourceGetSurfaceDimensions", 
("hipD3D9ResourceGetSurfaceDimensions", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9ResourceSetMapFlags", ("hipD3D9ResourceSetMapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9UnmapResources", ("hipD3D9UnmapResources", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D9UnregisterResource", ("hipD3D9UnregisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10DeviceListAll", ("HIP_D3D10_DEVICE_LIST_ALL", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10DeviceListCurrentFrame", ("HIP_D3D10_DEVICE_LIST_CURRENT_FRAME", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10DeviceListNextFrame", ("HIP_D3D10_DEVICE_LIST_NEXT_FRAME", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10GetDevice", ("hipD3D10GetDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10GetDevices", ("hipD3D10GetDevices", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsD3D10RegisterResource", ("hipGraphicsD3D10RegisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10MapFlagsNone", ("HIP_D3D10_MAPRESOURCE_FLAGS_NONE", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10MapFlagsReadOnly", ("HIP_D3D10_MAPRESOURCE_FLAGS_READONLY", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10MapFlagsWriteDiscard", ("HIP_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10RegisterFlagsNone", ("HIP_D3D10_REGISTER_FLAGS_NONE", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10RegisterFlagsArray", ("HIP_D3D10_REGISTER_FLAGS_ARRAY", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10GetDirect3DDevice", ("hipD3D10GetDirect3DDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10MapResources", ("hipD3D10MapResources", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10RegisterResource", ("hipD3D10RegisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10ResourceGetMappedArray", ("hipD3D10ResourceGetMappedArray", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10ResourceGetMappedPitch", ("hipD3D10ResourceGetMappedPitch", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10ResourceGetMappedPointer", ("hipD3D10ResourceGetMappedPointer", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10ResourceGetMappedSize", ("hipD3D10ResourceGetMappedSize", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10ResourceGetSurfaceDimensions", ("hipD3D10ResourceGetSurfaceDimensions", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10ResourceSetMapFlags", ("hipD3D10ResourceSetMapFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10SetDirect3DDevice", ("hipD3D10SetDirect3DDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10UnmapResources", ("hipD3D10UnmapResources", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D10UnregisterResource", ("hipD3D10UnregisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11DeviceListAll", ("HIP_D3D11_DEVICE_LIST_ALL", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11DeviceListCurrentFrame", ("HIP_D3D11_DEVICE_LIST_CURRENT_FRAME", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11DeviceListNextFrame", ("HIP_D3D11_DEVICE_LIST_NEXT_FRAME", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11GetDevice", ("hipD3D11GetDevice", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11GetDevices", ("hipD3D11GetDevices", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsD3D11RegisterResource", ("hipGraphicsD3D11RegisterResource", 
CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11GetDevice", ("hipD3D11GetDevice", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaD3D11GetDevices", ("hipD3D11GetDevices", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsD3D11RegisterResource", ("hipGraphicsD3D11RegisterResource", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsVDPAURegisterOutputSurface", ("hipGraphicsVDPAURegisterOutputSurface", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsVDPAURegisterVideoSurface", ("hipGraphicsVDPAURegisterVideoSurface", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaVDPAUGetDevice", ("hipVDPAUGetDevice", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaVDPAUSetVDPAUDevice", ("hipVDPAUSetDevice", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamConsumerAcquireFrame", ("hipEGLStreamConsumerAcquireFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamConsumerConnect", ("hipEGLStreamConsumerConnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamConsumerConnectWithFlags", ("hipEGLStreamConsumerConnectWithFlags", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamConsumerReleaseFrame", ("hipEGLStreamConsumerReleaseFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamProducerConnect", ("hipEGLStreamProducerConnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamProducerDisconnect", ("hipEGLStreamProducerDisconnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamProducerPresentFrame", ("hipEGLStreamProducerPresentFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaEGLStreamProducerReturnFrame", ("hipEGLStreamProducerReturnFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsEGLRegisterImage", ("hipGraphicsEGLRegisterImage", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cudaGraphicsResourceGetMappedEglFrame", ("hipGraphicsResourceGetMappedEglFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED)), - ("cublasInit", ("rocblas_init", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasShutdown", ("rocblas_shutdown", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasGetVersion", ("rocblas_get_version", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasGetError", ("rocblas_get_error", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasAlloc", ("rocblas_alloc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasFree", ("rocblas_free", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSetKernelStream", ("rocblas_set_kernel_stream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasGetAtomicsMode", ("rocblas_get_atomics_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSetAtomicsMode", ("rocblas_set_atomics_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasGetMathMode", ("rocblas_get_math_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSetMathMode", ("rocblas_set_math_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("CUBLAS_OP_N", ("rocblas_operation_none", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_OP_T", ("rocblas_operation_transpose", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_OP_C", ("rocblas_operation_conjugate_transpose", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_SUCCESS", ("rocblas_status_success", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_NOT_INITIALIZED", ("rocblas_status_invalid_handle", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_ALLOC_FAILED", ("rocblas_status_memory_error", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_INVALID_VALUE", 
("rocblas_status_invalid_pointer", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_MAPPING_ERROR", ("rocblas_status_internal_error", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_EXECUTION_FAILED", ("rocblas_status_internal_error", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_INTERNAL_ERROR", ("rocblas_status_internal_error", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_NOT_SUPPORTED", ("rocblas_status_not_implemented", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_STATUS_ARCH_MISMATCH", ("rocblas_status_not_implemented", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_FILL_MODE_LOWER", ("rocblas_fill_lower", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_FILL_MODE_UPPER", ("rocblas_fill_upper", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_DIAG_NON_UNIT", ("rocblas_diagonal_non_unit", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_DIAG_UNIT", ("rocblas_diagonal_unit", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_SIDE_LEFT", ("rocblas_side_left", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_SIDE_RIGHT", ("rocblas_side_right", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_POINTER_MODE_HOST", ("rocblas_pointer_mode_host", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_POINTER_MODE_DEVICE", ("rocblas_pointer_mode_device", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUBLAS_ATOMICS_NOT_ALLOWED", ("rocblas_atomics_not_allowed", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("CUBLAS_ATOMICS_ALLOWED", ("rocblas_atomics_allowed", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("CUBLAS_DATA_FLOAT", ("rocblas_precision_float", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("CUBLAS_DATA_DOUBLE", ("rocblas_precision_double", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("CUBLAS_DATA_HALF", ("rocblas_precision_half", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("CUBLAS_DATA_INT8", ("rocblas_precision_int8", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCreate", ("rocblas_create_handle", CONV_MATH_FUNC, API_BLAS)), - ("cublasDestroy", ("rocblas_destroy_handle", CONV_MATH_FUNC, API_BLAS)), - ("cublasSetVector", ("rocblas_set_vector", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetVector", ("rocblas_get_vector", CONV_MATH_FUNC, API_BLAS)), - ("cublasSetVectorAsync", ("rocblas_set_vector_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasGetVectorAsync", ("rocblas_get_vector_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSetMatrix", ("rocblas_set_matrix", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetMatrix", ("rocblas_get_matrix", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetMatrixAsync", ("rocblas_get_matrix_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSetMatrixAsync", ("rocblas_set_matrix_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasXerbla", ("rocblas_xerbla", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSnrm2", ("rocblas_snrm2", CONV_MATH_FUNC, API_BLAS)), - ("cublasDnrm2", ("rocblas_dnrm2", CONV_MATH_FUNC, API_BLAS)), - ("cublasScnrm2", ("rocblas_scnrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDznrm2", ("rocblas_dznrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasNrm2Ex", ("rocblas_nrm2_ex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSdot", ("rocblas_sdot", CONV_MATH_FUNC, API_BLAS)), - ("cublasSdotBatched", ("rocblas_sdot_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDdot", ("rocblas_ddot", CONV_MATH_FUNC, API_BLAS)), - ("cublasDdotBatched", ("rocblas_ddot_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCdotu", 
("rocblas_cdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCdotc", ("rocblas_cdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdotu", ("rocblas_zdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdotc", ("rocblas_zdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSscal", ("rocblas_sscal", CONV_MATH_FUNC, API_BLAS)), - ("cublasSscalBatched", ("rocblas_sscal_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDscal", ("rocblas_dscal", CONV_MATH_FUNC, API_BLAS)), - ("cublasDscalBatched", ("rocblas_dscal_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCscal", ("rocblas_cscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsscal", ("rocblas_csscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZscal", ("rocblas_zscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdscal", ("rocblas_zdscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSaxpy", ("rocblas_saxpy", CONV_MATH_FUNC, API_BLAS)), - ("cublasSaxpyBatched", ("rocblas_saxpy_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDaxpy", ("rocblas_daxpy", CONV_MATH_FUNC, API_BLAS)), - ("cublasCaxpy", ("rocblas_caxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZaxpy", ("rocblas_zaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasScopy", ("rocblas_scopy", CONV_MATH_FUNC, API_BLAS)), - ("cublasScopyBatched", ("rocblas_scopy_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDcopy", ("rocblas_dcopy", CONV_MATH_FUNC, API_BLAS)), - ("cublasDcopyBatched", ("rocblas_dcopy_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCcopy", ("rocblas_ccopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZcopy", ("rocblas_zcopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSswap", ("rocblas_sswap", CONV_MATH_FUNC, API_BLAS)), - ("cublasDswap", ("rocblas_dswap", CONV_MATH_FUNC, API_BLAS)), - ("cublasCswap", ("rocblas_cswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZswap", ("rocblas_zswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIsamax", ("rocblas_isamax", CONV_MATH_FUNC, API_BLAS)), - ("cublasIdamax", ("rocblas_idamax", CONV_MATH_FUNC, API_BLAS)), - ("cublasIcamax", ("rocblas_icamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIzamax", ("rocblas_izamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIsamin", ("rocblas_isamin", CONV_MATH_FUNC, API_BLAS)), - ("cublasIdamin", ("rocblas_idamin", CONV_MATH_FUNC, API_BLAS)), - ("cublasIcamin", ("rocblas_icamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIzamin", ("rocblas_izamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSasum", ("rocblas_sasum", CONV_MATH_FUNC, API_BLAS)), - ("cublasSasumBatched", ("rocblas_sasum_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDasum", ("rocblas_dasum", CONV_MATH_FUNC, API_BLAS)), - ("cublasDasumBatched", ("rocblas_dasum_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasScasum", ("rocblas_scasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDzasum", ("rocblas_dzasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrot", ("rocblas_srot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrot", ("rocblas_drot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCrot", ("rocblas_crot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsrot", ("rocblas_csrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZrot", ("rocblas_zrot", 
CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdrot", ("rocblas_zdrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrotg", ("rocblas_srotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrotg", ("rocblas_drotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCrotg", ("rocblas_crotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZrotg", ("rocblas_zrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrotm", ("rocblas_srotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrotm", ("rocblas_drotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrotmg", ("rocblas_srotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrotmg", ("rocblas_drotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgemv", ("rocblas_sgemv", CONV_MATH_FUNC, API_BLAS)), - ("cublasSgemvBatched", ("rocblas_sgemv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgemv", ("rocblas_dgemv", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgemv", ("rocblas_cgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgemv", ("rocblas_zgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgbmv", ("rocblas_sgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgbmv", ("rocblas_dgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgbmv", ("rocblas_cgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgbmv", ("rocblas_zgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrmv", ("rocblas_strmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrmv", ("rocblas_dtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrmv", ("rocblas_ctrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrmv", ("rocblas_ztrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStbmv", ("rocblas_stbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtbmv", ("rocblas_dtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtbmv", ("rocblas_ctbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtbmv", ("rocblas_ztbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStpmv", ("rocblas_stpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtpmv", ("rocblas_dtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtpmv", ("rocblas_ctpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtpmv", ("rocblas_ztpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrsv", ("rocblas_strsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrsv", ("rocblas_dtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrsv", ("rocblas_ctrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrsv", ("rocblas_ztrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStpsv", ("rocblas_stpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtpsv", ("rocblas_dtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtpsv", ("rocblas_ctpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtpsv", ("rocblas_ztpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStbsv", ("rocblas_stbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtbsv", ("rocblas_dtbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtbsv", ("rocblas_ctbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtbsv", ("rocblas_ztbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsymv", ("rocblas_ssymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsymv", ("rocblas_dsymv", 
CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsymv", ("rocblas_csymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsymv", ("rocblas_zsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChemv", ("rocblas_chemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhemv", ("rocblas_zhemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsbmv", ("rocblas_ssbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsbmv", ("rocblas_dsbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChbmv", ("rocblas_chbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhbmv", ("rocblas_zhbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSspmv", ("rocblas_sspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDspmv", ("rocblas_dspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChpmv", ("rocblas_chpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhpmv", ("rocblas_zhpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSger", ("rocblas_sger", CONV_MATH_FUNC, API_BLAS)), - ("cublasDger", ("rocblas_dger", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgeru", ("rocblas_cgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgerc", ("rocblas_cgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgeru", ("rocblas_zgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgerc", ("rocblas_zgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyr", ("rocblas_ssyr", CONV_MATH_FUNC, API_BLAS)), - ("cublasDsyr", ("rocblas_dsyr", CONV_MATH_FUNC, API_BLAS)), - ("cublasCher", ("rocblas_cher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZher", ("rocblas_zher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSspr", ("rocblas_sspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDspr", ("rocblas_dspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChpr", ("rocblas_chpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhpr", ("rocblas_zhpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyr2", ("rocblas_ssyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyr2", ("rocblas_dsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCher2", ("rocblas_cher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZher2", ("rocblas_zher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSspr2", ("rocblas_sspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDspr2", ("rocblas_dspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChpr2", ("rocblas_chpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhpr2", ("rocblas_zhpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgemmBatched", ("rocblas_sgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgemmBatched", ("rocblas_dgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasHgemmBatched", ("rocblas_hgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgemmStridedBatched", ("rocblas_sgemm_strided_batched", CONV_MATH_FUNC, API_BLAS)), - ("cublasDgemmStridedBatched", ("rocblas_dgemm_strided_batched", CONV_MATH_FUNC, API_BLAS)), - ("cublasHgemmStridedBatched", ("rocblas_hgemm_strided_batched", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgemmBatched", ("rocblas_cgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgemm3mBatched", ("rocblas_cgemm_3m_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgemmBatched", ("rocblas_zgemm_batched", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), - ("cublasCgemmStridedBatched", ("rocblas_cgemm_strided_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgemm3mStridedBatched", ("rocblas_cgemm_3m_strided_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgemmStridedBatched", ("rocblas_zgemm_strided_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasHgemmStridedBatched", ("rocblas_hgemm_strided_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgemm", ("rocblas_sgemm", CONV_MATH_FUNC, API_BLAS)), - ("cublasDgemm", ("rocblas_dgemm", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgemm", ("rocblas_cgemm", CONV_MATH_FUNC, API_BLAS)), - ("cublasZgemm", ("rocblas_zgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasHgemm", ("rocblas_hgemm", CONV_MATH_FUNC, API_BLAS)), - ("cublasSsyrk", ("rocblas_ssyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyrk", ("rocblas_dsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyrk", ("rocblas_csyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyrk", ("rocblas_zsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCherk", ("rocblas_cherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZherk", ("rocblas_zherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyr2k", ("rocblas_ssyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyr2k", ("rocblas_dsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyr2k", ("rocblas_csyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyr2k", ("rocblas_zyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyrkx", ("rocblas_ssyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyrkx", ("rocblas_dsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyrkx", ("rocblas_csyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyrkx", ("rocblas_zsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCher2k", ("rocblas_cher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZher2k", ("rocblas_zher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCherkx", ("rocblas_cherkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZherkx", ("rocblas_zherkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsymm", ("rocblas_ssymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsymm", ("rocblas_dsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsymm", ("rocblas_csymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsymm", ("rocblas_zsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChemm", ("rocblas_chemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhemm", ("rocblas_zhemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrsm", ("rocblas_strsm", CONV_MATH_FUNC, API_BLAS)), - ("cublasDtrsm", ("rocblas_dtrsm", CONV_MATH_FUNC, API_BLAS)), - ("cublasCtrsm", ("rocblas_ctrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrsm", ("rocblas_ztrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrsmBatched", ("rocblas_strsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrsmBatched", ("rocblas_dtrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrsmBatched", ("rocblas_ctrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrsmBatched", ("rocblas_ztrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrmm", ("rocblas_strmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrmm", 
("rocblas_dtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrmm", ("rocblas_ctrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrmm", ("rocblas_ztrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgeam", ("rocblas_sgeam", CONV_MATH_FUNC, API_BLAS)), - ("cublasDgeam", ("rocblas_dgeam", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgeam", ("rocblas_cgeam", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgeam", ("rocblas_zgeam", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgetrfBatched", ("rocblas_sgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgetrfBatched", ("rocblas_dgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgetrfBatched", ("rocblas_cgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgetrfBatched", ("rocblas_zgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgetriBatched", ("rocblas_sgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgetriBatched", ("rocblas_dgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgetriBatched", ("rocblas_cgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgetriBatched", ("rocblas_zgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgetrsBatched", ("rocblas_sgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgetrsBatched", ("rocblas_dgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgetrsBatched", ("rocblas_cgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgetrsBatched", ("rocblas_zgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrsmBatched", ("rocblas_strsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrsmBatched", ("rocblas_dtrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrsmBatched", ("rocblas_ctrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrsmBatched", ("rocblas_dtrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSmatinvBatched", ("rocblas_smatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDmatinvBatched", ("rocblas_dmatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCmatinvBatched", ("rocblas_cmatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZmatinvBatched", ("rocblas_zmatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgeqrfBatched", ("rocblas_sgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgeqrfBatched", ("rocblas_dgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgeqrfBatched", ("rocblas_cgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgeqrfBatched", ("rocblas_zgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgelsBatched", ("rocblas_sgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgelsBatched", ("rocblas_dgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgelsBatched", ("rocblas_cgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgelsBatched", ("rocblas_zgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSdgmm", ("rocblas_sdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDdgmm", ("rocblas_ddgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCdgmm", ("rocblas_cdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdgmm", ("rocblas_zdgmm", CONV_MATH_FUNC, 
API_BLAS, HIP_UNSUPPORTED)), - ("cublasStpttr", ("rocblas_stpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtpttr", ("rocblas_dtpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtpttr", ("rocblas_ctpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtpttr", ("rocblas_ztpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrttp", ("rocblas_strttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrttp", ("rocblas_dtrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrttp", ("rocblas_ctrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrttp", ("rocblas_ztrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCreate_v2", ("rocblas_create_handle", CONV_MATH_FUNC, API_BLAS)), - ("cublasDestroy_v2", ("rocblas_destroy_handle", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetVersion_v2", ("rocblas_get_version", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSetStream", ("rocblas_set_stream", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetStream", ("rocblas_get_stream", CONV_MATH_FUNC, API_BLAS)), - ("cublasSetStream_v2", ("rocblas_set_stream", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetStream_v2", ("rocblas_get_stream", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetPointerMode", ("rocblas_get_pointer_mode", CONV_MATH_FUNC, API_BLAS)), - ("cublasSetPointerMode", ("rocblas_set_pointer_mode", CONV_MATH_FUNC, API_BLAS)), - ("cublasGetPointerMode_v2", ("rocblas_get_pointer_mode", CONV_MATH_FUNC, API_BLAS)), - ("cublasSetPointerMode_v2", ("rocblas_set_pointer_mode", CONV_MATH_FUNC, API_BLAS)), - ("cublasSgemv_v2", ("rocblas_sgemv", CONV_MATH_FUNC, API_BLAS)), - ("cublasDgemv_v2", ("rocblas_dgemv", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgemv_v2", ("rocblas_cgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgemv_v2", ("rocblas_zgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgbmv_v2", ("rocblas_sgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDgbmv_v2", ("rocblas_dgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgbmv_v2", ("rocblas_cgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgbmv_v2", ("rocblas_zgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrmv_v2", ("rocblas_strmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrmv_v2", ("rocblas_dtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrmv_v2", ("rocblas_ctrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrmv_v2", ("rocblas_ztrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStbmv_v2", ("rocblas_stbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtbmv_v2", ("rocblas_dtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtbmv_v2", ("rocblas_ctbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtbmv_v2", ("rocblas_ztbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStpmv_v2", ("rocblas_stpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtpmv_v2", ("rocblas_dtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtpmv_v2", ("rocblas_ctpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtpmv_v2", ("rocblas_ztpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrsv_v2", ("rocblas_strsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrsv_v2", ("rocblas_dtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrsv_v2", ("rocblas_ctrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrsv_v2", ("rocblas_ztrsv", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), - ("cublasStpsv_v2", ("rocblas_stpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtpsv_v2", ("rocblas_dtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtpsv_v2", ("rocblas_ctpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtpsv_v2", ("rocblas_ztpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStbsv_v2", ("rocblas_stbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtbsv_v2", ("rocblas_dtbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtbsv_v2", ("rocblas_ctbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtbsv_v2", ("rocblas_ztbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsymv_v2", ("rocblas_ssymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsymv_v2", ("rocblas_dsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsymv_v2", ("rocblas_csymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsymv_v2", ("rocblas_zsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChemv_v2", ("rocblas_chemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhemv_v2", ("rocblas_zhemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsbmv_v2", ("rocblas_ssbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsbmv_v2", ("rocblas_dsbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChbmv_v2", ("rocblas_chbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhbmv_v2", ("rocblas_zhbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSspmv_v2", ("rocblas_sspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDspmv_v2", ("rocblas_dspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChpmv_v2", ("rocblas_chpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhpmv_v2", ("rocblas_zhpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSger_v2", ("rocblas_sger", CONV_MATH_FUNC, API_BLAS)), - ("cublasDger_v2", ("rocblas_dger", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgeru_v2", ("rocblas_cgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgerc_v2", ("rocblas_cergc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgeru_v2", ("rocblas_zgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgerc_v2", ("rocblas_zgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyr_v2", ("rocblas_ssyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyr_v2", ("rocblas_dsyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyr_v2", ("rocblas_csyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyr_v2", ("rocblas_zsyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCher_v2", ("rocblas_cher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZher_v2", ("rocblas_zher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSspr_v2", ("rocblas_sspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDspr_v2", ("rocblas_dspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChpr_v2", ("rocblas_chpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhpr_v2", ("rocblas_zhpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyr2_v2", ("rocblas_ssyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyr2_v2", ("rocblas_dsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyr2_v2", ("rocblas_csyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyr2_v2", ("rocblas_zsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCher2_v2", ("rocblas_cher2", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), - ("cublasZher2_v2", ("rocblas_zher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSspr2_v2", ("rocblas_sspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDspr2_v2", ("rocblas_dspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChpr2_v2", ("rocblas_chpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhpr2_v2", ("rocblas_zhpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSgemm_v2", ("rocblas_sgemm", CONV_MATH_FUNC, API_BLAS)), - ("cublasDgemm_v2", ("rocblas_dgemm", CONV_MATH_FUNC, API_BLAS)), - ("cublasCgemm_v2", ("rocblas_cgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgemm3m", ("rocblas_cgemm_3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgemm3mEx", ("rocblas_cgemm_3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgemm_v2", ("rocblas_zgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZgemm3m", ("rocblas_zgemm_3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - # NB: The function rocblas_sgemmex doesn't actually exist in - # rocblas, as of 2018-12-05 - ("cublasSgemmEx", ("rocblas_sgemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasGemmEx", ("rocblas_gemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCgemmEx", ("rocblas_cgemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasUint8gemmBias", ("rocblas_uint8gemmbias", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyrk_v2", ("rocblas_ssyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyrk_v2", ("rocblas_dsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyrk_v2", ("rocblas_csyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyrk_v2", ("rocblas_zsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyrkEx", ("rocblas_csyrkex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyrk3mEx", ("rocblas_csyrk3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCherk_v2", ("rocblas_cherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCherkEx", ("rocblas_cherkex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCherk3mEx", ("rocblas_cherk3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZherk_v2", ("rocblas_zherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsyr2k_v2", ("rocblas_ssyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsyr2k_v2", ("rocblas_dsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsyr2k_v2", ("rocblas_csyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsyr2k_v2", ("rocblas_zsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCher2k_v2", ("rocblas_cher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZher2k_v2", ("rocblas_zher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSsymm_v2", ("rocblas_ssymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDsymm_v2", ("rocblas_dsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsymm_v2", ("rocblas_csymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZsymm_v2", ("rocblas_zsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasChemm_v2", ("rocblas_chemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZhemm_v2", ("rocblas_zhemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrsm_v2", ("rocblas_strsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrsm_v2", ("rocblas_dtrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrsm_v2", ("rocblas_ctrsm", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED)), - ("cublasZtrsm_v2", ("rocblas_ztrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasStrmm_v2", ("rocblas_strmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDtrmm_v2", ("rocblas_dtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCtrmm_v2", ("rocblas_ctrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZtrmm_v2", ("rocblas_ztrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSnrm2_v2", ("rocblas_snrm2", CONV_MATH_FUNC, API_BLAS)), - ("cublasDnrm2_v2", ("rocblas_dnrm2", CONV_MATH_FUNC, API_BLAS)), - ("cublasScnrm2_v2", ("rocblas_scnrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDznrm2_v2", ("rocblas_dznrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDotEx", ("rocblas_dotex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDotcEx", ("rocblas_dotcex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSdot_v2", ("rocblas_sdot", CONV_MATH_FUNC, API_BLAS)), - ("cublasDdot_v2", ("rocblas_ddot", CONV_MATH_FUNC, API_BLAS)), - ("cublasCdotu_v2", ("rocblas_cdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCdotc_v2", ("rocblas_cdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdotu_v2", ("rocblas_zdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdotc_v2", ("rocblas_zdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasScalEx", ("rocblas_scalex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSscal_v2", ("rocblas_sscal", CONV_MATH_FUNC, API_BLAS)), - ("cublasDscal_v2", ("rocblas_dscal", CONV_MATH_FUNC, API_BLAS)), - ("cublasCscal_v2", ("rocblas_cscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsscal_v2", ("rocblas_csscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZscal_v2", ("rocblas_zcsal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdscal_v2", ("rocblas_zdscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasAxpyEx", ("rocblas_axpyex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSaxpy_v2", ("rocblas_saxpy", CONV_MATH_FUNC, API_BLAS)), - ("cublasDaxpy_v2", ("rocblas_daxpy", CONV_MATH_FUNC, API_BLAS)), - ("cublasCaxpy_v2", ("rocblas_caxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZaxpy_v2", ("rocblas_zaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasScopy_v2", ("rocblas_scopy", CONV_MATH_FUNC, API_BLAS)), - ("cublasDcopy_v2", ("rocblas_dcopy", CONV_MATH_FUNC, API_BLAS)), - ("cublasCcopy_v2", ("rocblas_ccopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZcopy_v2", ("rocblas_zcopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSswap_v2", ("rocblas_sswap", CONV_MATH_FUNC, API_BLAS)), - ("cublasDswap_v2", ("rocblas_dswap", CONV_MATH_FUNC, API_BLAS)), - ("cublasCswap_v2", ("rocblas_cswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZswap_v2", ("rocblas_zswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIsamax_v2", ("rocblas_isamax", CONV_MATH_FUNC, API_BLAS)), - ("cublasIdamax_v2", ("rocblas_idamax", CONV_MATH_FUNC, API_BLAS)), - ("cublasIcamax_v2", ("rocblas_icamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIzamax_v2", ("rocblas_izamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIsamin_v2", ("rocblas_isamin", CONV_MATH_FUNC, API_BLAS)), - ("cublasIdamin_v2", ("rocblas_idamin", CONV_MATH_FUNC, API_BLAS)), - ("cublasIcamin_v2", ("rocblas_icamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasIzamin_v2", ("rocblas_izamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - 
("cublasSasum_v2", ("rocblas_sasum", CONV_MATH_FUNC, API_BLAS)), - ("cublasDasum_v2", ("rocblas_dasum", CONV_MATH_FUNC, API_BLAS)), - ("cublasScasum_v2", ("rocblas_scasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDzasum_v2", ("rocblas_dzasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrot_v2", ("rocblas_srot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrot_v2", ("rocblas_drot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCrot_v2", ("rocblas_crot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCsrot_v2", ("rocblas_csrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZrot_v2", ("rocblas_zrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZdrot_v2", ("rocblas_zdrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrotg_v2", ("rocblas_srotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrotg_v2", ("rocblas_drotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasCrotg_v2", ("rocblas_crotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasZrotg_v2", ("rocblas_zrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrotm_v2", ("rocblas_srotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrotm_v2", ("rocblas_drotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasSrotmg_v2", ("rocblas_srotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("cublasDrotmg_v2", ("rocblas_drotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), - ("CURAND_STATUS_SUCCESS", ("HIPRAND_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_VERSION_MISMATCH", ("HIPRAND_STATUS_VERSION_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_NOT_INITIALIZED", ("HIPRAND_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_ALLOCATION_FAILED", ("HIPRAND_STATUS_ALLOCATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_TYPE_ERROR", ("HIPRAND_STATUS_TYPE_ERROR", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_OUT_OF_RANGE", ("HIPRAND_STATUS_OUT_OF_RANGE", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_LENGTH_NOT_MULTIPLE", ("HIPRAND_STATUS_LENGTH_NOT_MULTIPLE", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_DOUBLE_PRECISION_REQUIRED", ("HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_LAUNCH_FAILURE", ("HIPRAND_STATUS_LAUNCH_FAILURE", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_PREEXISTING_FAILURE", ("HIPRAND_STATUS_PREEXISTING_FAILURE", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_INITIALIZATION_FAILED", ("HIPRAND_STATUS_INITIALIZATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_ARCH_MISMATCH", ("HIPRAND_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_STATUS_INTERNAL_ERROR", ("HIPRAND_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_TEST", ("HIPRAND_RNG_TEST", CONV_NUMERIC_LITERAL, API_RAND)), - ("mtgp32dc_params_fast_11213", ("mtgp32dc_params_fast_11213", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_PSEUDO_DEFAULT", ("HIPRAND_RNG_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_PSEUDO_XORWOW", ("HIPRAND_RNG_PSEUDO_XORWOW", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_PSEUDO_MRG32K3A", ("HIPRAND_RNG_PSEUDO_MRG32K3A", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_PSEUDO_MTGP32", ("HIPRAND_RNG_PSEUDO_MTGP32", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_PSEUDO_MT19937", ("HIPRAND_RNG_PSEUDO_MT19937", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_PSEUDO_PHILOX4_32_10", 
("HIPRAND_RNG_PSEUDO_PHILOX4_32_10", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_QUASI_DEFAULT", ("HIPRAND_RNG_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_QUASI_SOBOL32", ("HIPRAND_RNG_QUASI_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_QUASI_SCRAMBLED_SOBOL32", ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_QUASI_SOBOL64", ("HIPRAND_RNG_QUASI_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND)), - ("CURAND_RNG_QUASI_SCRAMBLED_SOBOL64", ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND)), - ("curand_ORDERING_PSEUDO_BEST", ("HIPRAND_ORDERING_PSEUDO_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_ORDERING_PSEUDO_DEFAULT", ("HIPRAND_ORDERING_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_ORDERING_PSEUDO_SEEDED", ("HIPRAND_ORDERING_PSEUDO_SEEDED", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_ORDERING_QUASI_DEFAULT", ("HIPRAND_ORDERING_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_DIRECTION_VECTORS_32_JOEKUO6", ("HIPRAND_DIRECTION_VECTORS_32_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", ("HIPRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_DIRECTION_VECTORS_64_JOEKUO6", ("HIPRAND_DIRECTION_VECTORS_64_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", ("HIPRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_CHOOSE_BEST", ("HIPRAND_CHOOSE_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_ITR", ("HIPRAND_ITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_KNUTH", ("HIPRAND_KNUTH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_HITR", ("HIPRAND_HITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_M1", ("HIPRAND_M1", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_M2", ("HIPRAND_M2", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_BINARY_SEARCH", ("HIPRAND_BINARY_SEARCH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_DISCRETE_GAUSS", ("HIPRAND_DISCRETE_GAUSS", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_REJECTION", ("HIPRAND_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_DEVICE_API", ("HIPRAND_DEVICE_API", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_FAST_REJECTION", ("HIPRAND_FAST_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_3RD", ("HIPRAND_3RD", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_DEFINITION", ("HIPRAND_DEFINITION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curand_POISSON", ("HIPRAND_POISSON", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), - ("curandCreateGenerator", ("hiprandCreateGenerator", CONV_MATH_FUNC, API_RAND)), - ("curandCreateGeneratorHost", ("hiprandCreateGeneratorHost", CONV_MATH_FUNC, API_RAND)), - ("curandCreatePoissonDistribution", ("hiprandCreatePoissonDistribution", CONV_MATH_FUNC, API_RAND)), - ("curandDestroyDistribution", ("hiprandDestroyDistribution", CONV_MATH_FUNC, API_RAND)), - ("curandDestroyGenerator", ("hiprandDestroyGenerator", CONV_MATH_FUNC, API_RAND)), - ("curandGenerate", ("hiprandGenerate", CONV_MATH_FUNC, API_RAND)), - ("curandGenerateLogNormal", ("hiprandGenerateLogNormal", CONV_MATH_FUNC, 
API_RAND)), - ("curandGenerateLogNormalDouble", ("hiprandGenerateLogNormalDouble", CONV_MATH_FUNC, API_RAND)), - ("curandGenerateLongLong", ("hiprandGenerateLongLong", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandGenerateNormal", ("hiprandGenerateNormal", CONV_MATH_FUNC, API_RAND)), - ("curandGenerateNormalDouble", ("hiprandGenerateNormalDouble", CONV_MATH_FUNC, API_RAND)), - ("curandGeneratePoisson", ("hiprandGeneratePoisson", CONV_MATH_FUNC, API_RAND)), - ("curandGenerateSeeds", ("hiprandGenerateSeeds", CONV_MATH_FUNC, API_RAND)), - ("curandGenerateUniform", ("hiprandGenerateUniform", CONV_MATH_FUNC, API_RAND)), - ("curandGenerateUniformDouble", ("hiprandGenerateUniformDouble", CONV_MATH_FUNC, API_RAND)), - ("curandGetDirectionVectors32", ("hiprandGetDirectionVectors32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandGetDirectionVectors64", ("hiprandGetDirectionVectors64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandGetProperty", ("hiprandGetProperty", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandGetScrambleConstants32", ("hiprandGetScrambleConstants32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandGetScrambleConstants64", ("hiprandGetScrambleConstants64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandGetVersion", ("hiprandGetVersion", CONV_MATH_FUNC, API_RAND)), - ("curandSetGeneratorOffset", ("hiprandSetGeneratorOffset", CONV_MATH_FUNC, API_RAND)), - ("curandSetGeneratorOrdering", ("hiprandSetGeneratorOrdering", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curandSetPseudoRandomGeneratorSeed", ("hiprandSetPseudoRandomGeneratorSeed", CONV_MATH_FUNC, API_RAND)), - ("curandSetQuasiRandomGeneratorDimensions", ("hiprandSetQuasiRandomGeneratorDimensions", CONV_MATH_FUNC, API_RAND)), - ("curandSetStream", ("hiprandSetStream", CONV_MATH_FUNC, API_RAND)), - ("curand", ("hiprand", CONV_DEVICE_FUNC, API_RAND)), - ("curand4", ("hiprand4", CONV_DEVICE_FUNC, API_RAND)), - ("curand_init", ("hiprand_init", CONV_DEVICE_FUNC, API_RAND)), - ("curand_log_normal", ("hiprand_log_normal", CONV_DEVICE_FUNC, API_RAND)), - ("curand_log_normal_double", ("hiprand_log_normal_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_log_normal2", ("hiprand_log_normal2", CONV_DEVICE_FUNC, API_RAND)), - ("curand_log_normal2_double", ("hiprand_log_normal2_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_log_normal4", ("hiprand_log_normal4", CONV_DEVICE_FUNC, API_RAND)), - ("curand_log_normal4_double", ("hiprand_log_normal4_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_mtgp32_single", ("hiprand_mtgp32_single", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curand_mtgp32_single_specific", ("hiprand_mtgp32_single_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curand_mtgp32_specific", ("hiprand_mtgp32_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("curand_normal", ("hiprand_normal", CONV_DEVICE_FUNC, API_RAND)), - ("curandMakeMTGP32Constants", ("hiprandMakeMTGP32Constants", CONV_DEVICE_FUNC, API_RAND)), - ("curandMakeMTGP32KernelState", ("hiprandMakeMTGP32KernelState", CONV_DEVICE_FUNC, API_RAND)), - ("curand_normal_double", ("hiprand_normal_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_normal2", ("hiprand_normal2", CONV_DEVICE_FUNC, API_RAND)), - ("curand_normal2_double", ("hiprand_normal2_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_normal4", ("hiprand_normal4", CONV_DEVICE_FUNC, API_RAND)), - ("curand_normal4_double", ("hiprand_normal4_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_uniform", ("hiprand_uniform", 
CONV_DEVICE_FUNC, API_RAND)), - ("curand_uniform_double", ("hiprand_uniform_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_uniform2_double", ("hiprand_uniform2_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_uniform4", ("hiprand_uniform4", CONV_DEVICE_FUNC, API_RAND)), - ("curand_uniform4_double", ("hiprand_uniform4_double", CONV_DEVICE_FUNC, API_RAND)), - ("curand_discrete", ("hiprand_discrete", CONV_DEVICE_FUNC, API_RAND)), - ("curand_discrete4", ("hiprand_discrete4", CONV_DEVICE_FUNC, API_RAND)), - ("curand_poisson", ("hiprand_poisson", CONV_DEVICE_FUNC, API_RAND)), - ("curand_poisson4", ("hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND)), - ("curand_Philox4x32_10", ("hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED)), - ("mtgp32_kernel_params", ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND)), - ("CUFFT_FORWARD", ("HIPFFT_FORWARD", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUFFT_INVERSE", ("HIPFFT_BACKWARD", CONV_NUMERIC_LITERAL, API_BLAS)), - ("CUFFT_COMPATIBILITY_DEFAULT", ("HIPFFT_COMPATIBILITY_DEFAULT", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED)), - ("cufftResult_t", ("hipfftResult_t", CONV_TYPE, API_FFT)), - ("cufftResult", ("hipfftResult", CONV_TYPE, API_FFT)), - ("CUFFT_SUCCESS", ("HIPFFT_SUCCESS", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INVALID_PLAN", ("HIPFFT_INVALID_PLAN", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_ALLOC_FAILED", ("HIPFFT_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INVALID_TYPE", ("HIPFFT_INVALID_TYPE", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INVALID_VALUE", ("HIPFFT_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INTERNAL_ERROR", ("HIPFFT_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_EXEC_FAILED", ("HIPFFT_EXEC_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_SETUP_FAILED", ("HIPFFT_SETUP_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INVALID_SIZE", ("HIPFFT_INVALID_SIZE", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_UNALIGNED_DATA", ("HIPFFT_UNALIGNED_DATA", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INCOMPLETE_PARAMETER_LIST", ("HIPFFT_INCOMPLETE_PARAMETER_LIST", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_INVALID_DEVICE", ("HIPFFT_INVALID_DEVICE", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_PARSE_ERROR", ("HIPFFT_PARSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_NO_WORKSPACE", ("HIPFFT_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_NOT_IMPLEMENTED", ("HIPFFT_NOT_IMPLEMENTED", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_LICENSE_ERROR", ("HIPFFT_LICENSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED)), - ("CUFFT_NOT_SUPPORTED", ("HIPFFT_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_FFT)), - ("cufftType_t", ("hipfftType_t", CONV_TYPE, API_FFT)), - ("cufftType", ("hipfftType", CONV_TYPE, API_FFT)), - ("CUFFT_R2C", ("HIPFFT_R2C", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_C2R", ("HIPFFT_C2R", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_C2C", ("HIPFFT_C2C", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_D2Z", ("HIPFFT_D2Z", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_Z2D", ("HIPFFT_Z2D", CONV_NUMERIC_LITERAL, API_FFT)), - ("CUFFT_Z2Z", ("HIPFFT_Z2Z", CONV_NUMERIC_LITERAL, API_FFT)), - ("cufftCompatibility_t", ("hipfftCompatibility_t", CONV_TYPE, API_FFT, HIP_UNSUPPORTED)), - ("cufftCompatibility", ("hipfftCompatibility", CONV_TYPE, API_FFT, HIP_UNSUPPORTED)), - ("CUFFT_COMPATIBILITY_FFTW_PADDING", ("HIPFFT_COMPATIBILITY_FFTW_PADDING", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED)), - ("cufftReal", ("hipfftReal", CONV_TYPE, API_FFT)), - ("cufftDoubleReal", 
("hipfftDoubleReal", CONV_TYPE, API_FFT)), - ("cufftComplex", ("hipfftComplex", CONV_TYPE, API_FFT)), - ("cufftDoubleComplex", ("hipfftDoubleComplex", CONV_TYPE, API_FFT)), - ("cufftHandle", ("hipfftHandle", CONV_TYPE, API_FFT)), - ("cufftPlan1d", ("hipfftPlan1d", CONV_MATH_FUNC, API_FFT)), - ("cufftPlan2d", ("hipfftPlan2d", CONV_MATH_FUNC, API_FFT)), - ("cufftPlan3d", ("hipfftPlan3d", CONV_MATH_FUNC, API_FFT)), - ("cufftPlanMany", ("hipfftPlanMany", CONV_MATH_FUNC, API_FFT)), - ("cufftMakePlan1d", ("hipfftMakePlan1d", CONV_MATH_FUNC, API_FFT)), - ("cufftMakePlan2d", ("hipfftMakePlan2d", CONV_MATH_FUNC, API_FFT)), - ("cufftMakePlan3d", ("hipfftMakePlan3d", CONV_MATH_FUNC, API_FFT)), - ("cufftMakePlanMany", ("hipfftMakePlanMany", CONV_MATH_FUNC, API_FFT)), - ("cufftMakePlanMany64", ("hipfftMakePlanMany64", CONV_MATH_FUNC, API_FFT)), - ("cufftGetSizeMany64", ("hipfftGetSizeMany64", CONV_MATH_FUNC, API_FFT)), - ("cufftEstimate1d", ("hipfftEstimate1d", CONV_MATH_FUNC, API_FFT)), - ("cufftEstimate2d", ("hipfftEstimate2d", CONV_MATH_FUNC, API_FFT)), - ("cufftEstimate3d", ("hipfftEstimate3d", CONV_MATH_FUNC, API_FFT)), - ("cufftEstimateMany", ("hipfftEstimateMany", CONV_MATH_FUNC, API_FFT)), - ("cufftCreate", ("hipfftCreate", CONV_MATH_FUNC, API_FFT)), - ("cufftGetSize1d", ("hipfftGetSize1d", CONV_MATH_FUNC, API_FFT)), - ("cufftGetSize2d", ("hipfftGetSize2d", CONV_MATH_FUNC, API_FFT)), - ("cufftGetSize3d", ("hipfftGetSize3d", CONV_MATH_FUNC, API_FFT)), - ("cufftGetSizeMany", ("hipfftGetSizeMany", CONV_MATH_FUNC, API_FFT)), - ("cufftGetSize", ("hipfftGetSize", CONV_MATH_FUNC, API_FFT)), - ("cufftSetWorkArea", ("hipfftSetWorkArea", CONV_MATH_FUNC, API_FFT)), - ("cufftSetAutoAllocation", ("hipfftSetAutoAllocation", CONV_MATH_FUNC, API_FFT)), - ("cufftExecC2C", ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT)), - ("cufftExecR2C", ("hipfftExecR2C", CONV_MATH_FUNC, API_FFT)), - ("cufftExecC2R", ("hipfftExecC2R", CONV_MATH_FUNC, API_FFT)), - ("cufftExecZ2Z", ("hipfftExecZ2Z", CONV_MATH_FUNC, API_FFT)), - ("cufftExecD2Z", ("hipfftExecD2Z", CONV_MATH_FUNC, API_FFT)), - ("cufftExecZ2D", ("hipfftExecZ2D", CONV_MATH_FUNC, API_FFT)), - ("cufftSetStream", ("hipfftSetStream", CONV_MATH_FUNC, API_FFT)), - ("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)), - ("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)), - ("cufftGetProperty", ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED)), - ("nvrtcResult", ("hiprtcResult", CONV_TYPE, API_RTC)), - ("NVRTC_SUCCESS", ("HIPRTC_SUCCESS", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_OUT_OF_MEMORY", ("HIPRTC_ERROR_OUT_OF_MEMORY", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_PROGRAM_CREATION_FAILURE", ("HIPRTC_ERROR_PROGRAM_CREATION_FAILURE", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_INVALID_INPUT", ("HIPRTC_ERROR_INVALID_INPUT", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_INVALID_PROGRAM", ("HIPRTC_ERROR_INVALID_PROGRAM", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_COMPILATION", ("HIPRTC_ERROR_COMPILATION", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_BUILTIN_OPERATION_FAILURE", ("HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION", ("HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID", ("HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID", CONV_TYPE, API_RTC)), - ("NVRTC_ERROR_INTERNAL_ERROR", ("HIPRTC_ERROR_INTERNAL_ERROR", CONV_TYPE, API_RTC)), - ("nvrtcGetErrorString", ("hiprtcGetErrorString", CONV_JIT, API_RTC)), - ("nvrtcVersion", ("hiprtcVersion", 
CONV_JIT, API_RTC)), - ("nvrtcProgram", ("hiprtcProgram", CONV_TYPE, API_RTC)), - ("nvrtcAddNameExpression", ("hiprtcAddNameExpression", CONV_JIT, API_RTC)), - ("nvrtcCompileProgram", ("hiprtcCompileProgram", CONV_JIT, API_RTC)), - ("nvrtcCreateProgram", ("hiprtcCreateProgram", CONV_JIT, API_RTC)), - ("nvrtcDestroyProgram", ("hiprtcDestroyProgram", CONV_JIT, API_RTC)), - ("nvrtcGetLoweredName", ("hiprtcGetLoweredName", CONV_JIT, API_RTC)), - ("nvrtcGetProgramLog", ("hiprtcGetProgramLog", CONV_JIT, API_RTC)), - ("nvrtcGetProgramLogSize", ("hiprtcGetProgramLogSize", CONV_JIT, API_RTC)), - ("nvrtcGetPTX", ("hiprtcGetCode", CONV_JIT, API_RTC)), - ("nvrtcGetPTXSize", ("hiprtcGetCodeSize", CONV_JIT, API_RTC)), - ("thrust::cuda", ("thrust::hip", CONV_MATH_FUNC, API_BLAS)), - ("cub::", ("hipcub::", CONV_MATH_FUNC, API_BLAS)), - ("nvtxMark", ("roctxMark", CONV_OTHER, API_ROCTX)), - ("nvtxMarkA", ("roctxMarkA", CONV_OTHER, API_ROCTX)), - ("nvtxRangePushA", ("roctxRangePushA", CONV_OTHER, API_ROCTX)), - ("nvtxRangePop", ("roctxRangePop", CONV_OTHER, API_ROCTX)), -]) - -CUDA_SPARSE_MAP = collections.OrderedDict([ - ("cusparseStatus_t", ("hipsparseStatus_t", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseHandle_t", ("hipsparseHandle_t", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseOperation_t", ("hipsparseOperation_t", CONV_TYPE, API_SPARSE)), - ("cusparseCreateMatDescr", ("hipsparseCreateMatDescr", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseCreate", ("hipsparseCreate", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseDestroyMatDescr", ("hipsparseDestroyMatDescr", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseDestroy", ("hipsparseDestroy", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseXcoo2csr", ("hipsparseXcoo2csr", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseMatDescr_t", ("hipsparseMatDescr_t", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseScsrmm2", ("hipsparseScsrmm2", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseDcsrmm2", ("hipsparseDcsrmm2", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseScsrmm", ("hipsparseScsrmm", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseDcsrmm", ("hipsparseDcsrmm", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseXcsrsort_bufferSizeExt", ("hipsparseXcsrsort_bufferSizeExt", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseXcsrsort", ("hipsparseXcsrsort", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseXcoosort_bufferSizeExt", ("hipsparseXcoosort_bufferSizeExt", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseXcoosortByRow", ("hipsparseXcoosortByRow", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseSetStream", ("hipsparseSetStream", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseCreateIdentityPermutation", ("hipsparseCreateIdentityPermutation", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseSetMatIndexBase", ("hipsparseSetMatIndexBase", CONV_MATH_FUNC, API_SPARSE)), - ("cusparseSetMatType", ("hipsparseSetMatType", CONV_MATH_FUNC, API_SPARSE)), - ("CUSPARSE_STATUS_SUCCESS", ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_NOT_INITIALIZED", ("HIPSPARSE_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_ALLOC_FAILED", ("HIPSPARSE_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_INVALID_VALUE", ("HIPSPARSE_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_MAPPING_ERROR", ("HIPSPARSE_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_EXECUTION_FAILED", ("HIPSPARSE_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_INTERNAL_ERROR", ("HIPSPARSE_STATUS_INTERNAL_ERROR", 
CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED", ("HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_ARCH_MISMATCH", ("HIPSPARSE_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_STATUS_ZERO_PIVOT", ("HIPSPARSE_STATUS_ZERO_PIVOT", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_OPERATION_TRANSPOSE", ("HIPSPARSE_OPERATION_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_OPERATION_NON_TRANSPOSE", ("HIPSPARSE_OPERATION_NON_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE", ("HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_INDEX_BASE_ZERO", ("HIPSPARSE_INDEX_BASE_ZERO", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_INDEX_BASE_ONE", ("HIPSPARSE_INDEX_BASE_ONE", CONV_NUMERIC_LITERAL, API_SPARSE)), - ("CUSPARSE_MATRIX_TYPE_GENERAL", ("HIPSPARSE_MATRIX_TYPE_GENERAL", CONV_NUMERIC_LITERAL, API_SPARSE)), -]) - -PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict([ - ("USE_CUDA", ("USE_ROCM", API_PYTORCH)), - ("CUDA_VERSION", ("HIP_VERSION", API_PYTORCH)), - ("cudaHostAllocator", ("hipHostAllocator", API_PYTORCH)), - ("cudaDeviceAllocator", ("hipDeviceAllocator", API_PYTORCH)), - ("define MAX_NUM_BLOCKS 200", ("define MAX_NUM_BLOCKS 64", API_PYTORCH)), - - ("cuda::CUDAGuard", ("hip::HIPGuardMasqueradingAsCUDA", API_PYTORCH)), - ("CUDAGuard", ("HIPGuardMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::OptionalCUDAGuard", ("hip::OptionalHIPGuardMasqueradingAsCUDA", API_PYTORCH)), - ("OptionalCUDAGuard", ("OptionalHIPGuardMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::CUDAStreamGuard", ("hip::HIPStreamGuardMasqueradingAsCUDA", API_PYTORCH)), - ("CUDAStreamGuard", ("HIPStreamGuardMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::OptionalCUDAStreamGuard", ("hip::OptionalHIPStreamGuardMasqueradingAsCUDA", API_PYTORCH)), - ("OptionalCUDAStreamGuard", ("OptionalHIPStreamGuardMasqueradingAsCUDA", API_PYTORCH)), - - # Only get needs to be transformed this way; all the other ones can go - # straight to the normal versions hip::HIPCachingAllocator - ("cuda::CUDACachingAllocator::get", ("hip::HIPCachingAllocatorMasqueradingAsCUDA::get", API_PYTORCH)), - ("CUDACachingAllocator::get", ("HIPCachingAllocatorMasqueradingAsCUDA::get", API_PYTORCH)), - ("cuda::CUDACachingAllocator::recordStream", ("hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", API_PYTORCH)), - ("CUDACachingAllocator::recordStream", ("HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::CUDAStream", ("hip::HIPStreamMasqueradingAsCUDA", API_PYTORCH)), - ("CUDAStream", ("HIPStreamMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::getStreamFromPool", ("hip::getStreamFromPoolMasqueradingAsCUDA", API_PYTORCH)), - ("getStreamFromPool", ("getStreamFromPoolMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::getDefaultCUDAStream", ("hip::getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH)), - ("getDefaultCUDAStream", ("getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::getCurrentCUDAStream", ("hip::getCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH)), - ("getCurrentCUDAStream", ("getCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH)), - - ("cuda::setCurrentCUDAStream", ("hip::setCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH)), - ("setCurrentCUDAStream", ("setCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH)), - - # TODO: Undo this special-case; see the header for 
motivation behind this - # hack. It's VERY important this is only applied to PyTorch HIPify. - ("c10/cuda/CUDAGuard.h", ("ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h", API_PYTORCH)), - ("c10/cuda/CUDACachingAllocator.h", ("ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h", API_PYTORCH)), - ("c10/cuda/CUDAStream.h", ("ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h", API_PYTORCH)), - ("gloo/cuda.h", ("gloo/hip.h", API_PYTORCH)), - ("gloo/cuda_allreduce_halving_doubling.h", ("gloo/hip_allreduce_halving_doubling.h", API_PYTORCH)), - ("gloo/cuda_allreduce_halving_doubling_pipelined.h", ("gloo/hip_allreduce_halving_doubling_pipelined.h", API_PYTORCH)), - ("gloo/cuda_allreduce_ring.h", ("gloo/hip_allreduce_ring.h", API_PYTORCH)), - ("gloo/cuda_broadcast_one_to_all.h", ("gloo/hip_broadcast_one_to_all.h", API_PYTORCH)), - ("gloo::CudaAllreduceHalvingDoublingPipelined", ("gloo::HipAllreduceHalvingDoublingPipelined", API_PYTORCH)), - ("gloo::CudaBroadcastOneToAll", ("gloo::HipBroadcastOneToAll", API_PYTORCH)), - ("gloo::CudaHostWorkspace", ("gloo::HipHostWorkspace", API_PYTORCH)), - ("gloo::CudaDeviceWorkspace", ("gloo::HipDeviceWorkspace", API_PYTORCH)), -]) - -CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict([ - ("cuda_stream" , ("hip_stream", API_CAFFE2)), - # if the header is a native hip folder (under hip directory), - # there is no need to add a hip path to it; the trie in hipify script - # takes this mapping order to forbid further replacement - ("/hip/" , ("/hip/", API_CAFFE2)), - ("/context_gpu" , ("/hip/context_gpu", API_CAFFE2)), - ("/common_gpu" , ("/hip/common_gpu", API_CAFFE2)), - ("/mixed_utils" , ("/hip/mixed_utils", API_CAFFE2)), - ("/operator_fallback_gpu" , ("/hip/operator_fallback_gpu", API_CAFFE2)), - ("/spatial_batch_norm_op_impl" , ("/hip/spatial_batch_norm_op_impl", API_CAFFE2)), - ("/recurrent_network_executor_gpu" , ("/hip/recurrent_network_executor_gpu", API_CAFFE2)), - ("/generate_proposals_op_util_nms_gpu" , ("/hip/generate_proposals_op_util_nms_gpu", API_CAFFE2)), - ("/max_pool_with_index_gpu", ("/hip/max_pool_with_index_gpu", API_CAFFE2)), - ("/THCCachingAllocator_gpu", ("/hip/THCCachingAllocator_gpu", API_CAFFE2)), - ("/top_k_heap_selection", ("/hip/top_k_heap_selection", API_CAFFE2)), - ("/top_k_radix_selection", ("/hip/top_k_radix_selection", API_CAFFE2)), - ("/GpuDefs", ("/hip/GpuDefs", API_CAFFE2)), - ("/GpuScanUtils", ("/hip/GpuScanUtils", API_CAFFE2)), - ("/GpuBitonicSort", ("/hip/GpuBitonicSort", API_CAFFE2)), - ("/math/reduce.cuh", ("/math/hip/reduce.cuh", API_CAFFE2)), - ("/gather_op.cuh", ("/hip/gather_op.cuh", API_CAFFE2)), - ("caffe2/core/common_cudnn.h", ("caffe2/core/hip/common_miopen.h", API_CAFFE2)), - ("REGISTER_CUDA_OPERATOR" , ("REGISTER_HIP_OPERATOR", API_CAFFE2)), - ("CUDA_1D_KERNEL_LOOP" , ("HIP_1D_KERNEL_LOOP", API_CAFFE2)), - ("CUDAContext" , ("HIPContext", API_CAFFE2)), - ("CAFFE_CUDA_NUM_THREADS" , ("CAFFE_HIP_NUM_THREADS", API_CAFFE2)), - ("HasCudaGPU" , ("HasHipGPU", API_CAFFE2)), - ("__expf" , ("expf", API_CAFFE2)), - ("CUBLAS_ENFORCE" , ("ROCBLAS_ENFORCE", API_CAFFE2)), - ("CUBLAS_CHECK" , ("ROCBLAS_CHECK", API_CAFFE2)), - ("cublas_handle" , ("rocblashandle", API_CAFFE2)), - ("CURAND_ENFORCE" ,("HIPRAND_ENFORCE", API_CAFFE2)), - ("CURAND_CHECK" ,("HIPRAND_CHECK", API_CAFFE2)), - ("curandGenerateUniform" , ("hiprandGenerateUniform", API_CAFFE2)), - ("curand_generator" , ("hiprand_generator", API_CAFFE2)), - ("CaffeCudaGetDevice" , ("CaffeHipGetDevice", API_CAFFE2)), - ("CUDA" ,("HIP", API_CAFFE2)), - ("Cuda" ,("Hip", API_CAFFE2)), - 
("cuda_" ,("hip_", API_CAFFE2)), - ("_cuda" ,("_hip", API_CAFFE2)), - ("CUDNN" ,("MIOPEN", API_CAFFE2)), - ("CuDNN" ,("MIOPEN", API_CAFFE2)), - ("cudnn" ,("miopen", API_CAFFE2)), - ("namespace cuda", ("namespace hip", API_CAFFE2)), - ("cuda::CUDAGuard", ("hip::HIPGuard", API_CAFFE2)), - ("cuda::OptionalCUDAGuard", ("hip::OptionalHIPGuard", API_CAFFE2)), - ("cuda::CUDAStreamGuard", ("hip::HIPStreamGuard", API_CAFFE2)), - ("cuda::OptionalCUDAStreamGuard", ("hip::OptionalHIPStreamGuard", API_CAFFE2)), - ("c10/cuda/CUDAGuard.h", ("c10/hip/HIPGuard.h", API_CAFFE2)), - ("gloo/cuda", ("gloo/hip", API_CAFFE2)), -]) - -# We must tread very carefully here. Blanket conversions like are done -# in CAFFE2_SPECIFIC_MAPPINGS are not presently supported on PyTorch, -# because a regex for CUDA will also match a filename like CUDAGuard.h, -# but the HIPIFY script doesn't presently move the file and so the substitution -# will be invalid. Instead, we specifically list out every identifier -# and file from c10/cuda which may be used externally, and do substitutions this -# way. -# -# NB: if you want a transformation to ONLY apply to the c10/ directory, -# put it as API_CAFFE2 -C10_MAPPINGS = collections.OrderedDict([ - ("cuda::compat::", ("hip::compat::", API_C10)), - ("c10/cuda/CUDAException.h", ("c10/hip/HIPException.h", API_C10)), - ("c10/cuda/CUDAMacros.h", ("c10/hip/HIPMacros.h", API_C10)), - ("c10/cuda/CUDAMathCompat.h", ("c10/hip/HIPMathCompat.h", API_C10)), - ("c10/cuda/CUDAFunctions.h", ("c10/hip/HIPFunctions.h", API_C10)), - ("c10/cuda/CUDAStream.h", ("c10/hip/HIPStream.h", API_C10)), - ("c10/cuda/CUDACachingAllocator.h", ("c10/hip/HIPCachingAllocator.h", API_C10)), - ("c10/cuda/impl/CUDATest.h", ("c10/hip/impl/HIPTest.h", API_C10)), - ("c10/cuda/impl/CUDAGuardImpl.h", ("c10/hip/impl/HIPGuardImpl.h", API_C10)), - ("c10/cuda/impl/cuda_cmake_macros.h", ("c10/hip/impl/hip_cmake_macros.h", API_C10)), - ("C10_CUDA_CHECK", ("C10_HIP_CHECK", API_C10)), - ("C10_CUDA_CHECK_WARN", ("C10_HIP_CHECK_WARN", API_C10)), - ("c10::cuda", ("c10::hip", API_C10)), - ("cuda::CUDAStream", ("hip::HIPStream", API_C10)), - ("CUDAStream", ("HIPStream", API_C10)), - # This substitution is not permissible, because there's another copy of this - # function in torch/cuda.h - # ("cuda::device_count", ("hip::device_count", API_C10)), - ("cuda::current_device", ("hip::current_device", API_C10)), - ("cuda::set_device", ("hip::set_device", API_C10)), - ("cuda::getStreamFromPool", ("hip::getStreamFromPool", API_C10)), - ("getStreamFromPool", ("getStreamFromPool", API_C10)), - ("cuda::getDefaultCUDAStream", ("hip::getDefaultHIPStream", API_C10)), - ("getDefaultCUDAStream", ("getDefaultHIPStream", API_C10)), - ("cuda::getCurrentCUDAStream", ("hip::getCurrentHIPStream", API_C10)), - ("getCurrentCUDAStream", ("getCurrentHIPStream", API_C10)), - ("cuda::setCurrentCUDAStream", ("hip::setCurrentHIPStream", API_C10)), - ("setCurrentCUDAStream", ("setCurrentHIPStream", API_C10)), - ("cuda::CUDACachingAllocator", ("hip::HIPCachingAllocator", API_C10)), - ("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)), -]) - -# NB: C10 mappings are more specific than Caffe2 mappings, so run them -# first -CUDA_TO_HIP_MAPPINGS = [CUDA_IDENTIFIER_MAP, CUDA_TYPE_NAME_MAP, - CUDA_INCLUDE_MAP, CUDA_SPARSE_MAP, C10_MAPPINGS, PYTORCH_SPECIFIC_MAPPINGS, CAFFE2_SPECIFIC_MAPPINGS] diff --git a/tools/amd_build/pyHIPIFY/__init__.py b/torch/utils/hipify/__init__.py similarity index 100% rename from tools/amd_build/pyHIPIFY/__init__.py rename to 
diff --git a/tools/amd_build/pyHIPIFY/constants.py b/torch/utils/hipify/constants.py
similarity index 100%
rename from tools/amd_build/pyHIPIFY/constants.py
rename to torch/utils/hipify/constants.py
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
new file mode 100644
index 0000000000000..54848b8888402
--- /dev/null
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -0,0 +1,8083 @@
+#!/usr/bin/env python3
+import collections
+
+from .constants import *
+
+""" Mapping of CUDA functions, include files, constants, and types to ROCm/HIP equivalents
+This closely follows the implementation in hipify-clang
+https://github.com/ROCm-Developer-Tools/HIP/blob/master/hipify-clang/src/CUDA2HipMap.cpp
+and its structure.
+There are different maps for fundamental names, include files, identifiers, sparse, and
+PyTorch specific translations.
+Each of the entries in these maps translates a CUDA string to a tuple containing the
+ROCm/HIP string, a type and API annotation and - optionally - an annotation if it is not
+supported in ROCm/HIP yet.
+"""
+
+# List of math functions that should be replaced inside device code only.
+MATH_TRANSPILATIONS = collections.OrderedDict(
+    [
+        ("std::max", ("::max")),
+        ("std::min", ("::min")),
+        ("std::ceil", ("::ceil")),
+        ("std::floor", ("::floor")),
+        ("std::exp", ("::exp")),
+        ("std::log", ("::log")),
+        ("std::pow", ("::pow")),
+        ("std::fabs", ("::fabs")),
+        ("std::fmod", ("::fmod")),
+        ("std::remainder", ("::remainder")),
+    ]
+)
+
+CUDA_TYPE_NAME_MAP = collections.OrderedDict(
+    [
+        ("CUresult", ("hipError_t", CONV_TYPE, API_DRIVER)),
+        ("cudaError_t", ("hipError_t", CONV_TYPE, API_RUNTIME)),
+        (
+            "CUDA_ARRAY3D_DESCRIPTOR",
+            ("HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        ("CUDA_ARRAY_DESCRIPTOR", ("HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER)),
+        ("CUDA_MEMCPY2D", ("hip_Memcpy2D", CONV_TYPE, API_DRIVER)),
+        ("CUDA_MEMCPY3D", ("HIP_MEMCPY3D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)),
+        (
+            "CUDA_MEMCPY3D_PEER",
+            ("HIP_MEMCPY3D_PEER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        (
+            "CUDA_POINTER_ATTRIBUTE_P2P_TOKENS",
+            (
+                "HIP_POINTER_ATTRIBUTE_P2P_TOKENS",
+                CONV_TYPE,
+                API_DRIVER,
+                HIP_UNSUPPORTED,
+            ),
+        ),
+        (
+            "CUDA_RESOURCE_DESC",
+            ("HIP_RESOURCE_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        (
+            "CUDA_RESOURCE_VIEW_DESC",
+            ("HIP_RESOURCE_VIEW_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        (
+            "CUipcEventHandle",
+            ("hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        ("CUipcMemHandle", ("hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)),
+        ("CUaddress_mode", ("hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)),
+        (
+            "CUarray_cubemap_face",
+            ("hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        ("CUarray_format", ("hipArray_format", CONV_TYPE, API_DRIVER)),
+        ("CUcomputemode", ("hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)),
+        ("CUmem_advise", ("hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)),
+        (
+            "CUmem_range_attribute",
+            ("hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED),
+        ),
+        ("CUctx_flags", ("hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)),
+        ("CUdevice", ("hipDevice_t", CONV_TYPE, API_DRIVER)),
+        ("CUdevice_attribute_enum", ("hipDeviceAttribute_t", CONV_TYPE, API_DRIVER)),
+        ("CUdevice_attribute", ("hipDeviceAttribute_t", CONV_TYPE, API_DRIVER)),
+        ("CUdeviceptr", ("hipDeviceptr_t", CONV_TYPE, API_DRIVER)),
+
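The docstring above pins down the entry format: the ROCm/HIP spelling comes first, followed by conversion-kind and API tags, with HIP_UNSUPPORTED appended when ROCm has no equivalent yet. A small sketch of how a consumer can unpack one of these tuples once this file lands under torch/utils/hipify (the real lookup code in the hipify script is more involved):

    from torch.utils.hipify.constants import HIP_UNSUPPORTED
    from torch.utils.hipify.cuda_to_hip_mappings import CUDA_TYPE_NAME_MAP

    # The HIP spelling is the first element; the rest are annotations.
    hip_name, *annotations = CUDA_TYPE_NAME_MAP["CUresult"]
    assert hip_name == "hipError_t"
    assert HIP_UNSUPPORTED not in annotations  # this mapping is supported

    # An entry tagged HIP_UNSUPPORTED has no usable ROCm equivalent yet.
    _, *annotations = CUDA_TYPE_NAME_MAP["CUDA_MEMCPY3D_PEER"]
    assert HIP_UNSUPPORTED in annotations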
("CUarray_st", ("hipArray", CONV_TYPE, API_DRIVER)), + ("CUarray", ("hipArray *", CONV_TYPE, API_DRIVER)), + ("CUdevprop_st", ("hipDeviceProp_t", CONV_TYPE, API_DRIVER)), + ("CUdevprop", ("hipDeviceProp_t", CONV_TYPE, API_DRIVER)), + ("CUfunction", ("hipFunction_t", CONV_TYPE, API_DRIVER)), + ( + "CUgraphicsResource", + ("hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUmipmappedArray", + ("hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUfunction_attribute", + ("hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUfunction_attribute_enum", + ("hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsMapResourceFlags", + ("hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsMapResourceFlags_enum", + ("hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsRegisterFlags", + ("hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUgraphicsRegisterFlags_enum", + ("hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUoccupancy_flags", + ("hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUoccupancy_flags_enum", + ("hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUfunc_cache_enum", ("hipFuncCache", CONV_TYPE, API_DRIVER)), + ("CUfunc_cache", ("hipFuncCache", CONV_TYPE, API_DRIVER)), + ("CUipcMem_flags", ("hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUipcMem_flags_enum", + ("hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUjit_cacheMode", ("hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUjit_cacheMode_enum", + ("hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUjit_fallback", ("hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUjit_fallback_enum", + ("hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUjit_option", ("hipJitOption", CONV_JIT, API_DRIVER)), + ("CUjit_option_enum", ("hipJitOption", CONV_JIT, API_DRIVER)), + ("CUjit_target", ("hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ("CUjit_target_enum", ("hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ("CUjitInputType", ("hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUjitInputType_enum", + ("hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUlimit", ("hipLimit_t", CONV_TYPE, API_DRIVER)), + ("CUlimit_enum", ("hipLimit_t", CONV_TYPE, API_DRIVER)), + ( + "CUmemAttach_flags", + ("hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUmemAttach_flags_enum", + ("hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUmemorytype", ("hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUmemorytype_enum", ("hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ("CUresourcetype", ("hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUresourcetype_enum", + ("hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUresourceViewFormat", ("hipResourceViewFormat", CONV_TEX, API_DRIVER)), + ("CUresourceViewFormat_enum", ("hipResourceViewFormat", CONV_TEX, API_DRIVER)), + ("CUsharedconfig", ("hipSharedMemConfig", CONV_TYPE, API_DRIVER)), + ("CUsharedconfig_enum", ("hipSharedMemConfig", CONV_TYPE, API_DRIVER)), + ("CUcontext", ("hipCtx_t", CONV_TYPE, API_DRIVER)), + ("CUmodule", ("hipModule_t", CONV_TYPE, API_DRIVER)), + ("CUstream", 
("hipStream_t", CONV_TYPE, API_DRIVER)), + ("CUstream_st", ("ihipStream_t", CONV_TYPE, API_DRIVER)), + ("CUstreamCallback", ("hipStreamCallback_t", CONV_TYPE, API_DRIVER)), + ("CUsurfObject", ("hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUsurfref", + ("hipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUtexObject", ("hipTextureObject_t", CONV_TYPE, API_DRIVER)), + ("CUtexref", ("textureReference", CONV_TYPE, API_DRIVER)), + ("CUstream_flags", ("hipStreamFlags", CONV_TYPE, API_DRIVER)), + ( + "CUstreamWaitValue_flags", + ("hipStreamWaitValueFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUstreamWriteValue_flags", + ("hipStreamWriteValueFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUstreamBatchMemOpType", + ("hipStreamBatchMemOpType", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUdevice_P2PAttribute", + ("hipDeviceP2PAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUevent", ("hipEvent_t", CONV_TYPE, API_DRIVER)), + ("CUevent_st", ("ihipEvent_t", CONV_TYPE, API_DRIVER)), + ("CUevent_flags", ("hipEventFlags", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED)), + ("CUfilter_mode", ("hipTextureFilterMode", CONV_TEX, API_DRIVER)), + ("CUGLDeviceList", ("hipGLDeviceList", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ("CUGLmap_flags", ("hipGLMapFlags", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUd3d9DeviceList", + ("hipD3D9DeviceList", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d9map_flags", + ("hipD3D9MapFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d9register_flags", + ("hipD3D9RegisterFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d10DeviceList", + ("hipd3d10DeviceList", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d10map_flags", + ("hipD3D10MapFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d10register_flags", + ("hipD3D10RegisterFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUd3d11DeviceList", + ("hipd3d11DeviceList", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUeglStreamConnection_st", + ("hipEglStreamConnection", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUeglStreamConnection", + ("hipEglStreamConnection", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "libraryPropertyType_t", + ("hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "libraryPropertyType", + ("hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaStreamCallback_t", ("hipStreamCallback_t", CONV_TYPE, API_RUNTIME)), + ("cudaArray", ("hipArray", CONV_MEM, API_RUNTIME)), + ("cudaArray_t", ("hipArray_t", CONV_MEM, API_RUNTIME)), + ("cudaArray_const_t", ("hipArray_const_t", CONV_MEM, API_RUNTIME)), + ("cudaMipmappedArray_t", ("hipMipmappedArray_t", CONV_MEM, API_RUNTIME)), + ( + "cudaMipmappedArray_const_t", + ("hipMipmappedArray_const_t", CONV_MEM, API_RUNTIME), + ), + ("cudaArrayDefault", ("hipArrayDefault", CONV_MEM, API_RUNTIME)), + ("cudaArrayLayered", ("hipArrayLayered", CONV_MEM, API_RUNTIME)), + ( + "cudaArraySurfaceLoadStore", + ("hipArraySurfaceLoadStore", CONV_MEM, API_RUNTIME), + ), + ("cudaArrayCubemap", ("hipArrayCubemap", CONV_MEM, API_RUNTIME)), + ("cudaArrayTextureGather", ("hipArrayTextureGather", CONV_MEM, API_RUNTIME)), + ("cudaMemoryAdvise", ("hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaMemRangeAttribute", + ("hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpyKind", ("hipMemcpyKind", 
CONV_MEM, API_RUNTIME)), + ("cudaMemoryType", ("hipMemoryType", CONV_MEM, API_RUNTIME)), + ("cudaExtent", ("hipExtent", CONV_MEM, API_RUNTIME)), + ("cudaPitchedPtr", ("hipPitchedPtr", CONV_MEM, API_RUNTIME)), + ("cudaPos", ("hipPos", CONV_MEM, API_RUNTIME)), + ("cudaEvent_t", ("hipEvent_t", CONV_TYPE, API_RUNTIME)), + ("cudaStream_t", ("hipStream_t", CONV_TYPE, API_RUNTIME)), + ("cudaPointerAttributes", ("hipPointerAttribute_t", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceAttr", ("hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceProp", ("hipDeviceProp_t", CONV_TYPE, API_RUNTIME)), + ( + "cudaDeviceP2PAttr", + ("hipDeviceP2PAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeMode", + ("hipComputeMode", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaFuncCache", ("hipFuncCache_t", CONV_CACHE, API_RUNTIME)), + ( + "cudaFuncAttributes", + ("hipFuncAttributes", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaSharedMemConfig", ("hipSharedMemConfig", CONV_TYPE, API_RUNTIME)), + ("cudaLimit", ("hipLimit_t", CONV_TYPE, API_RUNTIME)), + ("cudaOutputMode", ("hipOutputMode", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaTextureReadMode", ("hipTextureReadMode", CONV_TEX, API_RUNTIME)), + ("cudaTextureFilterMode", ("hipTextureFilterMode", CONV_TEX, API_RUNTIME)), + ("cudaChannelFormatKind", ("hipChannelFormatKind", CONV_TEX, API_RUNTIME)), + ("cudaChannelFormatDesc", ("hipChannelFormatDesc", CONV_TEX, API_RUNTIME)), + ("cudaResourceDesc", ("hipResourceDesc", CONV_TEX, API_RUNTIME)), + ("cudaResourceViewDesc", ("hipResourceViewDesc", CONV_TEX, API_RUNTIME)), + ("cudaTextureDesc", ("hipTextureDesc", CONV_TEX, API_RUNTIME)), + ( + "surfaceReference", + ("hipSurfaceReference", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaTextureObject_t", ("hipTextureObject_t", CONV_TEX, API_RUNTIME)), + ("cudaResourceType", ("hipResourceType", CONV_TEX, API_RUNTIME)), + ("cudaResourceViewFormat", ("hipResourceViewFormat", CONV_TEX, API_RUNTIME)), + ("cudaTextureAddressMode", ("hipTextureAddressMode", CONV_TEX, API_RUNTIME)), + ( + "cudaSurfaceBoundaryMode", + ("hipSurfaceBoundaryMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaSurfaceFormatMode", + ("hipSurfaceFormatMode", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaTextureType1D", ("hipTextureType1D", CONV_TEX, API_RUNTIME)), + ("cudaTextureType2D", ("hipTextureType2D", CONV_TEX, API_RUNTIME)), + ("cudaTextureType3D", ("hipTextureType3D", CONV_TEX, API_RUNTIME)), + ("cudaTextureTypeCubemap", ("hipTextureTypeCubemap", CONV_TEX, API_RUNTIME)), + ( + "cudaTextureType1DLayered", + ("hipTextureType1DLayered", CONV_TEX, API_RUNTIME), + ), + ( + "cudaTextureType2DLayered", + ("hipTextureType2DLayered", CONV_TEX, API_RUNTIME), + ), + ( + "cudaTextureTypeCubemapLayered", + ("hipTextureTypeCubemapLayered", CONV_TEX, API_RUNTIME), + ), + ("cudaIpcEventHandle_t", ("hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME)), + ("cudaIpcEventHandle_st", ("hipIpcEventHandle_t", CONV_TYPE, API_RUNTIME)), + ("cudaIpcMemHandle_t", ("hipIpcMemHandle_t", CONV_TYPE, API_RUNTIME)), + ("cudaIpcMemHandle_st", ("hipIpcMemHandle_t", CONV_TYPE, API_RUNTIME)), + ( + "cudaGraphicsCubeFace", + ("hipGraphicsCubeFace", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsMapFlags", + ("hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsRegisterFlags", + ("hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLDeviceList", + 
("hipGLDeviceList", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaGLMapFlags", ("hipGLMapFlags", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaD3D9DeviceList", + ("hipD3D9DeviceList", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9MapFlags", + ("hipD3D9MapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9RegisterFlags", + ("hipD3D9RegisterFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10DeviceList", + ("hipd3d10DeviceList", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10MapFlags", + ("hipD3D10MapFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10RegisterFlags", + ("hipD3D10RegisterFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11DeviceList", + ("hipd3d11DeviceList", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEglStreamConnection", + ("hipEglStreamConnection", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cublasHandle_t", ("rocblas_handle", CONV_TYPE, API_BLAS)), + ("cublasOperation_t", ("rocblas_operation", CONV_TYPE, API_BLAS)), + ("cublasStatus_t", ("rocblas_status", CONV_TYPE, API_BLAS)), + ("cublasFillMode_t", ("rocblas_fill", CONV_TYPE, API_BLAS)), + ("cublasDiagType_t", ("rocblas_diagonal", CONV_TYPE, API_BLAS)), + ("cublasSideMode_t", ("rocblas_side", CONV_TYPE, API_BLAS)), + ("cublasPointerMode_t", ("rocblas_pointer_mode", CONV_TYPE, API_BLAS)), + ( + "cublasAtomicsMode_t", + ("rocblas_atomics_mode", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDataType_t", + ("rocblas_data_type", CONV_TYPE, API_BLAS, HIP_UNSUPPORTED), + ), + ("curandStatus", ("hiprandStatus_t", CONV_TYPE, API_RAND)), + ("curandStatus_t", ("hiprandStatus_t", CONV_TYPE, API_RAND)), + ("curandRngType", ("hiprandRngType_t", CONV_TYPE, API_RAND)), + ("curandRngType_t", ("hiprandRngType_t", CONV_TYPE, API_RAND)), + ("curandGenerator_st", ("hiprandGenerator_st", CONV_TYPE, API_RAND)), + ("curandGenerator_t", ("hiprandGenerator_t", CONV_TYPE, API_RAND)), + ( + "curandDirectionVectorSet", + ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDirectionVectorSet_t", + ("hiprandDirectionVectorSet_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ("curandOrdering", ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)), + ( + "curandOrdering_t", + ("hiprandOrdering_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistribution_st", + ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2V_st", + ("hiprandDistribution_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistribution_t", + ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2V_t", + ("hiprandDistribution_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionShift_st", + ("hiprandDistributionShift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionShift_t", + ("hiprandDistributionShift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionM2Shift_st", + ("hiprandDistributionM2Shift_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandDistributionM2Shift_t", + ("hiprandDistributionM2Shift_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2_st", + ("hiprandHistogramM2_st", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2_t", + ("hiprandHistogramM2_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandHistogramM2K_st", + ("hiprandHistogramM2K_st", 
CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+        ),
+        (
+            "curandHistogramM2K_t",
+            ("hiprandHistogramM2K_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+        ),
+        (
+            "curandDiscreteDistribution_st",
+            ("hiprandDiscreteDistribution_st", CONV_TYPE, API_RAND),
+        ),
+        (
+            "curandDiscreteDistribution_t",
+            ("hiprandDiscreteDistribution_t", CONV_TYPE, API_RAND),
+        ),
+        ("curandMethod", ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)),
+        ("curandMethod_t", ("hiprandMethod_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED)),
+        (
+            "curandDirectionVectors32_t",
+            ("hiprandDirectionVectors32_t", CONV_TYPE, API_RAND),
+        ),
+        (
+            "curandDirectionVectors64_t",
+            ("hiprandDirectionVectors64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+        ),
+        ("curandStateMtgp32_t", ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND)),
+        ("curandStateMtgp32", ("hiprandStateMtgp32_t", CONV_TYPE, API_RAND)),
+        (
+            "curandStateScrambledSobol64_t",
+            ("hiprandStateScrambledSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+        ),
+        (
+            "curandStateSobol64_t",
+            ("hiprandStateSobol64_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+        ),
+        (
+            "curandStateScrambledSobol32_t",
+            ("hiprandStateScrambledSobol32_t", CONV_TYPE, API_RAND, HIP_UNSUPPORTED),
+        ),
+        ("curandStateSobol32_t", ("hiprandStateSobol32_t", CONV_TYPE, API_RAND)),
+        ("curandStateMRG32k3a_t", ("hiprandStateMRG32k3a_t", CONV_TYPE, API_RAND)),
+        (
+            "curandStatePhilox4_32_10_t",
+            ("hiprandStatePhilox4_32_10_t", CONV_TYPE, API_RAND),
+        ),
+        ("curandStateXORWOW_t", ("hiprandStateXORWOW_t", CONV_TYPE, API_RAND)),
+        ("curandState_t", ("hiprandState_t", CONV_TYPE, API_RAND)),
+        ("curandState", ("hiprandState_t", CONV_TYPE, API_RAND)),
+    ]
+)
+
+CUDA_INCLUDE_MAP = collections.OrderedDict(
+    [
+        # since pytorch uses "\b{pattern}\b" as the actual re pattern,
+        # patterns listed here have to begin and end with alnum chars
+        # the "include <cuda.h>" form is listed (rather than a bare header
+        # name) to differentiate the main CUDA header from other headers
+        (
+            "include <cuda.h>",
+            ("include <hip/hip_runtime.h>", CONV_INCLUDE, API_RUNTIME),
+        ),
+        ("nvrtc.h", ("hip/hiprtc.h", CONV_INCLUDE, API_RTC)),
+        ("thrust/system/cuda", ("thrust/system/hip", CONV_INCLUDE, API_BLAS)),
+        ("cub/util_allocator.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("cub/block/block_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("cub/cub.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("cub/block/block_load.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("cub/device/device_reduce.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("cub/device/device_scan.cuh", ("hipcub/hipcub.hpp", CONV_INCLUDE, API_BLAS)),
+        ("nvToolsExt.h", ("roctx.h", CONV_INCLUDE, API_ROCTX)),
+    ]
+)
+
+CUDA_IDENTIFIER_MAP = collections.OrderedDict(
+    [
+        ("__CUDACC__", ("__HIPCC__", CONV_DEF, API_RUNTIME)),
+        (
+            "CUDA_ERROR_INVALID_CONTEXT",
+            ("hipErrorInvalidContext", CONV_TYPE, API_DRIVER),
+        ),
+        (
+            "CUDA_ERROR_CONTEXT_ALREADY_CURRENT",
+            ("hipErrorContextAlreadyCurrent", CONV_TYPE, API_DRIVER),
+        ),
+        (
+            "CUDA_ERROR_ARRAY_IS_MAPPED",
+            ("hipErrorArrayIsMapped", CONV_TYPE, API_DRIVER),
+        ),
+        ("CUDA_ERROR_ALREADY_MAPPED", ("hipErrorAlreadyMapped", CONV_TYPE, API_DRIVER)),
+        (
+            "CUDA_ERROR_ALREADY_ACQUIRED",
+            ("hipErrorAlreadyAcquired", CONV_TYPE, API_DRIVER),
+        ),
+        ("CUDA_ERROR_NOT_MAPPED", ("hipErrorNotMapped", CONV_TYPE, API_DRIVER)),
+        (
+            "CUDA_ERROR_NOT_MAPPED_AS_ARRAY",
+            ("hipErrorNotMappedAsArray", CONV_TYPE, API_DRIVER),
+        ),
+        (
+            "CUDA_ERROR_NOT_MAPPED_AS_POINTER",
+            ("hipErrorNotMappedAsPointer", CONV_TYPE, API_DRIVER),
+        ),
+        (
+            "CUDA_ERROR_CONTEXT_ALREADY_IN_USE",
+            ("hipErrorContextAlreadyInUse", CONV_TYPE, API_DRIVER),
+        ),
+
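The comment at the head of CUDA_INCLUDE_MAP above is worth unpacking: hipify wraps each key as r"\b{pattern}\b", and re's \b only asserts a boundary next to a word character, so a key that starts or ends with punctuation can never match. A quick demonstration of the failure mode the comment guards against (assuming the \b-wrapping described there; the actual pattern assembly lives in the hipify script itself):

    import re

    # "cuda_runtime.h" begins and ends with alphanumeric characters, so both
    # \b anchors find word boundaries and the pattern matches.
    assert re.search(r"\bcuda_runtime.h\b", '#include "cuda_runtime.h"')

    # A key starting with "<" can never match here: \b immediately before a
    # non-word character needs a word character on its left, but the "<" is
    # preceded by a space in the include line.
    assert re.search(r"\b<cuda.h>\b", "#include <cuda.h>") is None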
("CUDA_ERROR_INVALID_SOURCE", ("hipErrorInvalidSource", CONV_TYPE, API_DRIVER)), + ("CUDA_ERROR_FILE_NOT_FOUND", ("hipErrorFileNotFound", CONV_TYPE, API_DRIVER)), + ("CUDA_ERROR_NOT_FOUND", ("hipErrorNotFound", CONV_TYPE, API_DRIVER)), + ( + "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING", + ( + "hipErrorLaunchIncompatibleTexturing", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE", + ("hipErrorPrimaryContextActive", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_CONTEXT_IS_DESTROYED", + ("hipErrorContextIsDestroyed", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NOT_PERMITTED", + ("hipErrorNotPermitted", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NOT_SUPPORTED", + ("hipErrorNotSupported", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMissingConfiguration", + ("hipErrorMissingConfiguration", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorPriorLaunchFailure", + ("hipErrorPriorLaunchFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidDeviceFunction", + ("hipErrorInvalidDeviceFunction", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidConfiguration", + ("hipErrorInvalidConfiguration", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidPitchValue", + ("hipErrorInvalidPitchValue", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidSymbol", + ("hipErrorInvalidSymbol", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidHostPointer", + ("hipErrorInvalidHostPointer", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidDevicePointer", + ("hipErrorInvalidDevicePointer", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaErrorInvalidTexture", + ("hipErrorInvalidTexture", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidTextureBinding", + ("hipErrorInvalidTextureBinding", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidChannelDescriptor", + ( + "hipErrorInvalidChannelDescriptor", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaErrorInvalidMemcpyDirection", + ("hipErrorInvalidMemcpyDirection", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorAddressOfConstant", + ("hipErrorAddressOfConstant", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorTextureFetchFailed", + ("hipErrorTextureFetchFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorTextureNotBound", + ("hipErrorTextureNotBound", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorSynchronizationError", + ("hipErrorSynchronizationError", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidFilterSetting", + ("hipErrorInvalidFilterSetting", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidNormSetting", + ("hipErrorInvalidNormSetting", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMixedDeviceExecution", + ("hipErrorMixedDeviceExecution", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorNotYetImplemented", + ("hipErrorNotYetImplemented", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMemoryValueTooLarge", + ("hipErrorMemoryValueTooLarge", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInsufficientDriver", + ("hipErrorInsufficientDriver", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorSetOnActiveProcess", + ("hipErrorSetOnActiveProcess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), 
+ ), + ( + "cudaErrorInvalidSurface", + ("hipErrorInvalidSurface", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDuplicateVariableName", + ("hipErrorDuplicateVariableName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDuplicateTextureName", + ("hipErrorDuplicateTextureName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDuplicateSurfaceName", + ("hipErrorDuplicateSurfaceName", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorDevicesUnavailable", + ("hipErrorDevicesUnavailable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorIncompatibleDriverContext", + ( + "hipErrorIncompatibleDriverContext", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaErrorDeviceAlreadyInUse", + ("hipErrorDeviceAlreadyInUse", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchMaxDepthExceeded", + ("hipErrorLaunchMaxDepthExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchFileScopedTex", + ("hipErrorLaunchFileScopedTex", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchFileScopedSurf", + ("hipErrorLaunchFileScopedSurf", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorSyncDepthExceeded", + ("hipErrorSyncDepthExceeded", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchPendingCountExceeded", + ( + "hipErrorLaunchPendingCountExceeded", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaErrorNotPermitted", + ("hipErrorNotPermitted", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorNotSupported", + ("hipErrorNotSupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorStartupFailure", + ("hipErrorStartupFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaErrorApiFailureBase", + ("hipErrorApiFailureBase", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_SUCCESS", ("hipSuccess", CONV_TYPE, API_DRIVER)), + ("cudaSuccess", ("hipSuccess", CONV_TYPE, API_RUNTIME)), + ("CUDA_ERROR_INVALID_VALUE", ("hipErrorInvalidValue", CONV_TYPE, API_DRIVER)), + ("cudaErrorInvalidValue", ("hipErrorInvalidValue", CONV_TYPE, API_RUNTIME)), + ( + "CUDA_ERROR_OUT_OF_MEMORY", + ("hipErrorMemoryAllocation", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorMemoryAllocation", + ("hipErrorMemoryAllocation", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_NOT_INITIALIZED", + ("hipErrorNotInitialized", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInitializationError", + ("hipErrorInitializationError", CONV_TYPE, API_RUNTIME), + ), + ("CUDA_ERROR_DEINITIALIZED", ("hipErrorDeinitialized", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorCudartUnloading", + ("hipErrorDeinitialized", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_DISABLED", + ("hipErrorProfilerDisabled", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerDisabled", + ("hipErrorProfilerDisabled", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_NOT_INITIALIZED", + ("hipErrorProfilerNotInitialized", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerNotInitialized", + ("hipErrorProfilerNotInitialized", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_ALREADY_STARTED", + ("hipErrorProfilerAlreadyStarted", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorProfilerAlreadyStarted", + ("hipErrorProfilerAlreadyStarted", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PROFILER_ALREADY_STOPPED", + ("hipErrorProfilerAlreadyStopped", CONV_TYPE, API_DRIVER), 
+ ), + ( + "cudaErrorProfilerAlreadyStopped", + ("hipErrorProfilerAlreadyStopped", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_ERROR_NO_DEVICE", ("hipErrorNoDevice", CONV_TYPE, API_DRIVER)), + ("cudaErrorNoDevice", ("hipErrorNoDevice", CONV_TYPE, API_RUNTIME)), + ("CUDA_ERROR_INVALID_DEVICE", ("hipErrorInvalidDevice", CONV_TYPE, API_DRIVER)), + ("cudaErrorInvalidDevice", ("hipErrorInvalidDevice", CONV_TYPE, API_RUNTIME)), + ("CUDA_ERROR_INVALID_IMAGE", ("hipErrorInvalidImage", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorInvalidKernelImage", + ("hipErrorInvalidImage", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_ERROR_MAP_FAILED", ("hipErrorMapFailed", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorMapBufferObjectFailed", + ("hipErrorMapFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("CUDA_ERROR_UNMAP_FAILED", ("hipErrorUnmapFailed", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorUnmapBufferObjectFailed", + ("hipErrorUnmapFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NO_BINARY_FOR_GPU", + ("hipErrorNoBinaryForGpu", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorNoKernelImageForDevice", + ("hipErrorNoBinaryForGpu", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_ECC_UNCORRECTABLE", + ("hipErrorECCNotCorrectable", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorECCUncorrectable", + ("hipErrorECCNotCorrectable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_UNSUPPORTED_LIMIT", + ("hipErrorUnsupportedLimit", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorUnsupportedLimit", + ("hipErrorUnsupportedLimit", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PEER_ACCESS_UNSUPPORTED", + ("hipErrorPeerAccessUnsupported", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorPeerAccessUnsupported", + ("hipErrorPeerAccessUnsupported", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_PTX", + ("hipErrorInvalidKernelFile", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInvalidPtx", + ("hipErrorInvalidKernelFile", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_GRAPHICS_CONTEXT", + ("hipErrorInvalidGraphicsContext", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInvalidGraphicsContext", + ("hipErrorInvalidGraphicsContext", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_NVLINK_UNCORRECTABLE", + ("hipErrorNvlinkUncorrectable", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorNvlinkUncorrectable", + ("hipErrorNvlinkUncorrectable", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND", + ("hipErrorSharedObjectSymbolNotFound", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorSharedObjectSymbolNotFound", + ( + "hipErrorSharedObjectSymbolNotFound", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED", + ("hipErrorSharedObjectInitFailed", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorSharedObjectInitFailed", + ("hipErrorSharedObjectInitFailed", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_OPERATING_SYSTEM", + ("hipErrorOperatingSystem", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorOperatingSystem", + ("hipErrorOperatingSystem", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_HANDLE", + ("hipErrorInvalidResourceHandle", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorInvalidResourceHandle", + ("hipErrorInvalidResourceHandle", CONV_TYPE, API_RUNTIME), + ), + ("CUDA_ERROR_NOT_READY", ("hipErrorNotReady", CONV_TYPE, API_DRIVER)), + 
("cudaErrorNotReady", ("hipErrorNotReady", CONV_TYPE, API_RUNTIME)), + ( + "CUDA_ERROR_ILLEGAL_ADDRESS", + ("hipErrorIllegalAddress", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorIllegalAddress", + ("hipErrorIllegalAddress", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES", + ("hipErrorLaunchOutOfResources", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorLaunchOutOfResources", + ("hipErrorLaunchOutOfResources", CONV_TYPE, API_RUNTIME), + ), + ("CUDA_ERROR_LAUNCH_TIMEOUT", ("hipErrorLaunchTimeOut", CONV_TYPE, API_DRIVER)), + ( + "cudaErrorLaunchTimeout", + ("hipErrorLaunchTimeOut", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED", + ("hipErrorPeerAccessAlreadyEnabled", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorPeerAccessAlreadyEnabled", + ("hipErrorPeerAccessAlreadyEnabled", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED", + ("hipErrorPeerAccessNotEnabled", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorPeerAccessNotEnabled", + ("hipErrorPeerAccessNotEnabled", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_ASSERT", + ("hipErrorAssert", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorAssert", + ("hipErrorAssert", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_TOO_MANY_PEERS", + ("hipErrorTooManyPeers", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorTooManyPeers", + ("hipErrorTooManyPeers", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED", + ("hipErrorHostMemoryAlreadyRegistered", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorHostMemoryAlreadyRegistered", + ("hipErrorHostMemoryAlreadyRegistered", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED", + ("hipErrorHostMemoryNotRegistered", CONV_TYPE, API_DRIVER), + ), + ( + "cudaErrorHostMemoryNotRegistered", + ("hipErrorHostMemoryNotRegistered", CONV_TYPE, API_RUNTIME), + ), + ( + "CUDA_ERROR_HARDWARE_STACK_ERROR", + ("hipErrorHardwareStackError", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorHardwareStackError", + ("hipErrorHardwareStackError", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_ILLEGAL_INSTRUCTION", + ("hipErrorIllegalInstruction", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorIllegalInstruction", + ("hipErrorIllegalInstruction", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_MISALIGNED_ADDRESS", + ("hipErrorMisalignedAddress", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorMisalignedAddress", + ("hipErrorMisalignedAddress", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_ADDRESS_SPACE", + ("hipErrorInvalidAddressSpace", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidAddressSpace", + ("hipErrorInvalidAddressSpace", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_INVALID_PC", + ("hipErrorInvalidPc", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorInvalidPc", + ("hipErrorInvalidPc", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_LAUNCH_FAILED", + ("hipErrorLaunchFailure", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cudaErrorLaunchFailure", + ("hipErrorLaunchFailure", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "CUDA_ERROR_UNKNOWN", + ("hipErrorUnknown", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cudaErrorUnknown", ("hipErrorUnknown", CONV_TYPE, API_RUNTIME)), + ( + "CU_TR_ADDRESS_MODE_WRAP", + 
("HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TR_ADDRESS_MODE_CLAMP", + ("HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TR_ADDRESS_MODE_MIRROR", + ("HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TR_ADDRESS_MODE_BORDER", + ("HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_POSITIVE_X", + ("HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_NEGATIVE_X", + ("HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_POSITIVE_Y", + ("HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_NEGATIVE_Y", + ("HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_POSITIVE_Z", + ("HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CUBEMAP_FACE_NEGATIVE_Z", + ("HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_AD_FORMAT_UNSIGNED_INT8", + ("HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_UNSIGNED_INT16", + ("HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_UNSIGNED_INT32", + ("HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_SIGNED_INT8", + ("HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_SIGNED_INT16", + ("HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER), + ), + ( + "CU_AD_FORMAT_SIGNED_INT32", + ("HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER), + ), + ("CU_AD_FORMAT_HALF", ("HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER)), + ("CU_AD_FORMAT_FLOAT", ("HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER)), + ( + "CU_COMPUTEMODE_DEFAULT", + ("hipComputeModeDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_COMPUTEMODE_EXCLUSIVE", + ("hipComputeModeExclusive", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_COMPUTEMODE_PROHIBITED", + ("hipComputeModeProhibited", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_COMPUTEMODE_EXCLUSIVE_PROCESS", + ("hipComputeModeExclusiveProcess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_SET_READ_MOSTLY", + ("hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_UNSET_READ_MOSTLY", + ("hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_SET_PREFERRED_LOCATION", + ( + "hipMemAdviseSetPreferredLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION", + ( + "hipMemAdviseUnsetPreferredLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_MEM_ADVISE_SET_ACCESSED_BY", + ("hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ADVISE_UNSET_ACCESSED_BY", + ("hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY", + ("hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION", + ( + "hipMemRangeAttributePreferredLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY", + ("hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION", + ( + 
"hipMemRangeAttributeLastPrefetchLocation", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_CTX_SCHED_AUTO", + ("HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_SPIN", + ("HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_YIELD", + ("HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_BLOCKING_SYNC", + ("HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_BLOCKING_SYNC", + ("HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_SCHED_MASK", + ("HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_MAP_HOST", + ("HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_LMEM_RESIZE_TO_MAX", + ("HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_CTX_FLAGS_MASK", + ("HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LAUNCH_PARAM_BUFFER_POINTER", + ("HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_TYPE, API_DRIVER), + ), + ( + "CU_LAUNCH_PARAM_BUFFER_SIZE", + ("HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_TYPE, API_DRIVER), + ), + ("CU_LAUNCH_PARAM_END", ("HIP_LAUNCH_PARAM_END", CONV_TYPE, API_DRIVER)), + ( + "CU_IPC_HANDLE_SIZE", + ("HIP_LAUNCH_PARAM_END", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTALLOC_DEVICEMAP", + ("HIP_MEMHOSTALLOC_DEVICEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTALLOC_PORTABLE", + ("HIP_MEMHOSTALLOC_PORTABLE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTALLOC_WRITECOMBINED", + ("HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTREGISTER_DEVICEMAP", + ("HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTREGISTER_IOMEMORY", + ("HIP_MEMHOSTREGISTER_IOMEMORY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMHOSTREGISTER_PORTABLE", + ("HIP_MEMHOSTREGISTER_PORTABLE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_PARAM_TR_DEFAULT", + ("HIP_PARAM_TR_DEFAULT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_LEGACY", + ("HIP_STREAM_LEGACY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_PER_THREAD", + ("HIP_STREAM_PER_THREAD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TRSA_OVERRIDE_FORMAT", + ("HIP_TRSA_OVERRIDE_FORMAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TRSF_NORMALIZED_COORDINATES", + ("HIP_TRSF_NORMALIZED_COORDINATES", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TRSF_READ_AS_INTEGER", + ("HIP_TRSF_READ_AS_INTEGER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_TRSF_SRGB", ("HIP_TRSF_SRGB", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CUDA_ARRAY3D_2DARRAY", + ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_CUBEMAP", + ("HIP_ARRAY3D_CUBEMAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_DEPTH_TEXTURE", + ("HIP_ARRAY3D_DEPTH_TEXTURE", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_LAYERED", + ("HIP_ARRAY3D_LAYERED", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_SURFACE_LDST", + ("HIP_ARRAY3D_SURFACE_LDST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CUDA_ARRAY3D_TEXTURE_GATHER", + ("HIP_ARRAY3D_TEXTURE_GATHER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK", + ( + 
"hipDeviceAttributeMaxThreadsPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X", + ("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y", + ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z", + ("hipDeviceAttributeMaxBlockDimZ", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X", + ("hipDeviceAttributeMaxGridDimX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y", + ("hipDeviceAttributeMaxGridDimY", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z", + ("hipDeviceAttributeMaxGridDimZ", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK", + ( + "hipDeviceAttributeMaxSharedMemoryPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK", + ( + "hipDeviceAttributeMaxSharedMemoryPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY", + ( + "hipDeviceAttributeTotalConstantMemory", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_WARP_SIZE", + ("hipDeviceAttributeWarpSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_PITCH", + ("hipDeviceAttributeMaxPitch", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK", + ( + "hipDeviceAttributeMaxRegistersPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK", + ( + "hipDeviceAttributeMaxRegistersPerBlock", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CLOCK_RATE", + ("hipDeviceAttributeClockRate", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT", + ( + "hipDeviceAttributeTextureAlignment", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_GPU_OVERLAP", + ( + "hipDeviceAttributeAsyncEngineCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT", + ( + "hipDeviceAttributeMultiprocessorCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT", + ( + "hipDeviceAttributeKernelExecTimeout", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_INTEGRATED", + ("hipDeviceAttributeIntegrated", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY", + ( + "hipDeviceAttributeCanMapHostMemory", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_MODE", + ("hipDeviceAttributeComputeMode", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH", + ( + "hipDeviceAttributeMaxTexture3DWidth", + CONV_TYPE, + 
API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT", + ( + "hipDeviceAttributeMaxTexture3DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH", + ( + "hipDeviceAttributeMaxTexture3DDepth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DLayeredHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxTexture2DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DLayeredHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES", + ( + "hipDeviceAttributeMaxTexture2DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT", + ( + "hipDeviceAttributeSurfaceAlignment", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS", + ("hipDeviceAttributeConcurrentKernels", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_ECC_ENABLED", + ("hipDeviceAttributeEccEnabled", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_PCI_BUS_ID", + ("hipDeviceAttributePciBusId", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID", + ("hipDeviceAttributePciDeviceId", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_TCC_DRIVER", + ("hipDeviceAttributeTccDriver", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE", + ( + "hipDeviceAttributeMemoryClockRate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH", + ("hipDeviceAttributeMemoryBusWidth", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE", + ("hipDeviceAttributeL2CacheSize", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR", + ("hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT", + ( + "hipDeviceAttributeAsyncEngineCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING", + ( + "hipDeviceAttributeUnifiedAddressing", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxTexture1DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER", + ( + "hipDeviceAttributeCanTex2DGather", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DGatherWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + 
( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DGatherHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE", + ( + "hipDeviceAttributeMaxTexture3DWidthAlternate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE", + ( + "hipDeviceAttributeMaxTexture3DHeightAlternate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE", + ( + "hipDeviceAttributeMaxTexture3DDepthAlternate", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID", + ("hipDeviceAttributePciDomainId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT", + ( + "hipDeviceAttributeTexturePitchAlignment", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH", + ( + "hipDeviceAttributeMaxTextureCubemapWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH", + ( + "hipDeviceAttributeMaxSurface1DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH", + ( + "hipDeviceAttributeMaxSurface2DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT", + ( + "hipDeviceAttributeMaxSurface2DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH", + ( + "hipDeviceAttributeMaxSurface3DWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT", + ( + "hipDeviceAttributeMaxSurface3DHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH", + ( + "hipDeviceAttributeMaxSurface3DDepth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxSurface1DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxSurface1DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH", + ( + "hipDeviceAttributeMaxSurface2DLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT", + ( + "hipDeviceAttributeMaxSurface2DLayeredHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxSurface2DLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH", + ( + "hipDeviceAttributeMaxSurfaceCubemapWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH", + ( + 
"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DLinearWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DLinearWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DLinearHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH", + ( + "hipDeviceAttributeMaxTexture2DLinearPitch", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH", + ( + "hipDeviceAttributeMaxTexture2DMipmappedWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT", + ( + "hipDeviceAttributeMaxTexture2DMipmappedHeight", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR", + ("hipDeviceAttributeComputeCapabilityMajor", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR", + ("hipDeviceAttributeComputeCapabilityMinor", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH", + ( + "hipDeviceAttributeMaxTexture1DMipmappedWidth", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED", + ( + "hipDeviceAttributeStreamPrioritiesSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED", + ( + "hipDeviceAttributeGlobalL1CacheSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED", + ( + "hipDeviceAttributeLocalL1CacheSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR", + ( + "hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", + CONV_TYPE, + API_DRIVER, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR", + ( + "hipDeviceAttributeMaxRegistersPerMultiprocessor", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY", + ("hipDeviceAttributeManagedMemory", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD", + ("hipDeviceAttributeIsMultiGpuBoard", CONV_TYPE, API_DRIVER), + ), + ( + "CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID", + ( + "hipDeviceAttributeMultiGpuBoardGroupId", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED", + ( + "hipDeviceAttributeHostNativeAtomicSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO", + ( + "hipDeviceAttributeSingleToDoublePrecisionPerfRatio", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS", + ( + "hipDeviceAttributePageableMemoryAccess", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS", + ( + 
"hipDeviceAttributeConcurrentManagedAccess", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED", + ( + "hipDeviceAttributeComputePreemptionSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM", + ( + "hipDeviceAttributeCanUseHostPointerForRegisteredMem", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_ATTRIBUTE_MAX", + ("hipDeviceAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_CONTEXT", + ("hipPointerAttributeContext", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_MEMORY_TYPE", + ("hipPointerAttributeMemoryType", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_DEVICE_POINTER", + ( + "hipPointerAttributeDevicePointer", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_POINTER_ATTRIBUTE_HOST_POINTER", + ("hipPointerAttributeHostPointer", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_P2P_TOKENS", + ("hipPointerAttributeP2pTokens", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_SYNC_MEMOPS", + ("hipPointerAttributeSyncMemops", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_BUFFER_ID", + ("hipPointerAttributeBufferId", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_POINTER_ATTRIBUTE_IS_MANAGED", + ("hipPointerAttributeIsManaged", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK", + ( + "hipFuncAttributeMaxThreadsPerBlocks", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES", + ("hipFuncAttributeSharedSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES", + ("hipFuncAttributeConstSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES", + ("hipFuncAttributeLocalSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_NUM_REGS", + ("hipFuncAttributeNumRegs", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_PTX_VERSION", + ("hipFuncAttributePtxVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_BINARY_VERSION", + ("hipFuncAttributeBinaryVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_CACHE_MODE_CA", + ("hipFuncAttributeCacheModeCA", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_FUNC_ATTRIBUTE_MAX", + ("hipFuncAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE", + ("hipGraphicsMapFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY", + ("hipGraphicsMapFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + ("hipGraphicsMapFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_NONE", + ("hipGraphicsRegisterFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY", + ( + "hipGraphicsRegisterFlagsReadOnly", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD", + ( + "hipGraphicsRegisterFlagsWriteDiscard", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST", + ( + 
"hipGraphicsRegisterFlagsSurfaceLoadStore", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER", + ( + "hipGraphicsRegisterFlagsTextureGather", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_OCCUPANCY_DEFAULT", + ("hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE", + ( + "hipOccupancyDisableCachingOverride", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_FUNC_CACHE_PREFER_NONE", + ("hipFuncCachePreferNone", CONV_CACHE, API_DRIVER), + ), + ( + "CU_FUNC_CACHE_PREFER_SHARED", + ("hipFuncCachePreferShared", CONV_CACHE, API_DRIVER), + ), + ("CU_FUNC_CACHE_PREFER_L1", ("hipFuncCachePreferL1", CONV_CACHE, API_DRIVER)), + ( + "CU_FUNC_CACHE_PREFER_EQUAL", + ("hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER), + ), + ( + "CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS", + ("hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CUDA_IPC_HANDLE_SIZE", ("HIP_IPC_HANDLE_SIZE", CONV_TYPE, API_DRIVER)), + ( + "CU_JIT_CACHE_OPTION_NONE", + ("hipJitCacheModeOptionNone", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_CACHE_OPTION_CG", + ("hipJitCacheModeOptionCG", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_CACHE_OPTION_CA", + ("hipJitCacheModeOptionCA", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_PREFER_PTX", + ("hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_PREFER_BINARY", + ("hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_JIT_MAX_REGISTERS", ("hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_THREADS_PER_BLOCK", + ("hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_WALL_TIME", ("hipJitOptionWallTime", CONV_JIT, API_DRIVER)), + ("CU_JIT_INFO_LOG_BUFFER", ("hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES", + ("hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_ERROR_LOG_BUFFER", + ("hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES", + ("hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_OPTIMIZATION_LEVEL", + ("hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_TARGET_FROM_CUCONTEXT", + ("hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_TARGET", ("hipJitOptionTarget", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_FALLBACK_STRATEGY", + ("hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER), + ), + ( + "CU_JIT_GENERATE_DEBUG_INFO", + ("hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_LOG_VERBOSE", ("hipJitOptionLogVerbose", CONV_JIT, API_DRIVER)), + ( + "CU_JIT_GENERATE_LINE_INFO", + ("hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER), + ), + ("CU_JIT_CACHE_MODE", ("hipJitOptionCacheMode", CONV_JIT, API_DRIVER)), + ("CU_JIT_NEW_SM3X_OPT", ("hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER)), + ("CU_JIT_FAST_COMPILE", ("hipJitOptionFastCompile", CONV_JIT, API_DRIVER)), + ("CU_JIT_NUM_OPTIONS", ("hipJitOptionNumOptions", CONV_JIT, API_DRIVER)), + ( + "CU_TARGET_COMPUTE_10", + ("hipJitTargetCompute10", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_11", + ("hipJitTargetCompute11", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_12", + ("hipJitTargetCompute12", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_13", + 
("hipJitTargetCompute13", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_20", + ("hipJitTargetCompute20", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_21", + ("hipJitTargetCompute21", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_30", + ("hipJitTargetCompute30", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_32", + ("hipJitTargetCompute32", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_35", + ("hipJitTargetCompute35", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_37", + ("hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_50", + ("hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_52", + ("hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_53", + ("hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_60", + ("hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_61", + ("hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_TARGET_COMPUTE_62", + ("hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_CUBIN", + ("hipJitInputTypeBin", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_PTX", + ("hipJitInputTypePtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_FATBINARY", + ("hipJitInputTypeFatBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_OBJECT", + ("hipJitInputTypeObject", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_INPUT_LIBRARY", + ("hipJitInputTypeLibrary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_JIT_NUM_INPUT_TYPES", + ("hipJitInputTypeNumInputTypes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_STACK_SIZE", + ("hipLimitStackSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_PRINTF_FIFO_SIZE", + ("hipLimitPrintfFifoSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_MALLOC_HEAP_SIZE", + ("hipLimitMallocHeapSize", CONV_TYPE, API_DRIVER), + ), + ( + "CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH", + ("hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT", + ( + "hipLimitDevRuntimePendingLaunchCount", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_LIMIT_STACK_SIZE", + ("hipLimitStackSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ATTACH_GLOBAL", + ("hipMemAttachGlobal", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ATTACH_HOST", + ("hipMemAttachHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEM_ATTACH_SINGLE", + ("hipMemAttachSingle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_HOST", + ("hipMemTypeHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_DEVICE", + ("hipMemTypeDevice", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_ARRAY", + ("hipMemTypeArray", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_MEMORYTYPE_UNIFIED", + ("hipMemTypeUnified", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_ARRAY", + ("hipResourceTypeArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_MIPMAPPED_ARRAY", + ("hipResourceTypeMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_LINEAR", + 
("hipResourceTypeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_RESOURCE_TYPE_PITCH2D", + ("hipResourceTypePitch2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_RES_VIEW_FORMAT_NONE", ("hipResViewFormatNone", CONV_TEX, API_DRIVER)), + ( + "CU_RES_VIEW_FORMAT_UINT_1X8", + ("hipResViewFormatUnsignedChar1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_2X8", + ("hipResViewFormatUnsignedChar2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_4X8", + ("hipResViewFormatUnsignedChar4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_1X8", + ("hipResViewFormatSignedChar1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_2X8", + ("hipResViewFormatSignedChar2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_4X8", + ("hipResViewFormatSignedChar4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_1X16", + ("hipResViewFormatUnsignedShort1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_2X16", + ("hipResViewFormatUnsignedShort2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_4X16", + ("hipResViewFormatUnsignedShort4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_1X16", + ("hipResViewFormatSignedShort1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_2X16", + ("hipResViewFormatSignedShort2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_4X16", + ("hipResViewFormatSignedShort4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_1X32", + ("hipResViewFormatUnsignedInt1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_2X32", + ("hipResViewFormatUnsignedInt2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UINT_4X32", + ("hipResViewFormatUnsignedInt4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_1X32", + ("hipResViewFormatSignedInt1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_2X32", + ("hipResViewFormatSignedInt2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SINT_4X32", + ("hipResViewFormatSignedInt4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_1X16", + ("hipResViewFormatHalf1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_2X16", + ("hipResViewFormatHalf2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_4X16", + ("hipResViewFormatHalf4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_1X32", + ("hipResViewFormatFloat1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_2X32", + ("hipResViewFormatFloat2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_FLOAT_4X32", + ("hipResViewFormatFloat4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC1", + ("hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC2", + ("hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC3", + ("hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC4", + ("hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SIGNED_BC4", + ("hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC5", + ("hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SIGNED_BC5", + ("hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC6H", + ("hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, 
API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_SIGNED_BC6H", + ("hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER), + ), + ( + "CU_RES_VIEW_FORMAT_UNSIGNED_BC7", + ("hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER), + ), + ( + "CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE", + ("hipSharedMemBankSizeDefault", CONV_TYPE, API_DRIVER), + ), + ( + "CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE", + ("hipSharedMemBankSizeFourByte", CONV_TYPE, API_DRIVER), + ), + ( + "CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE", + ("hipSharedMemBankSizeEightByte", CONV_TYPE, API_DRIVER), + ), + ("CU_STREAM_DEFAULT", ("hipStreamDefault", CONV_TYPE, API_DRIVER)), + ("CU_STREAM_NON_BLOCKING", ("hipStreamNonBlocking", CONV_TYPE, API_DRIVER)), + ( + "CU_STREAM_WAIT_VALUE_GEQ", + ("hipStreamWaitValueGeq", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WAIT_VALUE_EQ", + ("hipStreamWaitValueEq", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WAIT_VALUE_AND", + ("hipStreamWaitValueAnd", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WAIT_VALUE_FLUSH", + ("hipStreamWaitValueFlush", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WRITE_VALUE_DEFAULT", + ("hipStreamWriteValueDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER", + ( + "hipStreamWriteValueNoMemoryBarrier", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_STREAM_MEM_OP_WAIT_VALUE_32", + ("hipStreamBatchMemOpWaitValue32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_MEM_OP_WRITE_VALUE_32", + ("hipStreamBatchMemOpWriteValue32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES", + ( + "hipStreamBatchMemOpFlushRemoteWrites", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGetErrorName", + ("hipGetErrorName___", CONV_ERROR, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGetErrorString", + ("hipGetErrorString___", CONV_ERROR, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuInit", ("hipInit", CONV_INIT, API_DRIVER)), + ("cuDriverGetVersion", ("hipDriverGetVersion", CONV_VERSION, API_DRIVER)), + ("cuCtxCreate_v2", ("hipCtxCreate", CONV_CONTEXT, API_DRIVER)), + ("cuCtxDestroy_v2", ("hipCtxDestroy", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetApiVersion", ("hipCtxGetApiVersion", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetCacheConfig", ("hipCtxGetCacheConfig", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetCurrent", ("hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetDevice", ("hipCtxGetDevice", CONV_CONTEXT, API_DRIVER)), + ("cuCtxGetFlags", ("hipCtxGetFlags", CONV_CONTEXT, API_DRIVER)), + ( + "cuCtxGetLimit", + ("hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuCtxGetSharedMemConfig", + ("hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuCtxGetStreamPriorityRange", + ("hipCtxGetStreamPriorityRange", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuCtxPopCurrent_v2", ("hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER)), + ("cuCtxPushCurrent_v2", ("hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER)), + ("cuCtxSetCacheConfig", ("hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER)), + ("cuCtxSetCurrent", ("hipCtxSetCurrent", CONV_CONTEXT, API_DRIVER)), + ( + "cuCtxSetLimit", + ("hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuCtxSetSharedMemConfig", + ("hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER), + ), + ("cuCtxSynchronize", ("hipCtxSynchronize", CONV_CONTEXT, API_DRIVER)), + ("cuCtxAttach", 
("hipCtxAttach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), + ("cuCtxDetach", ("hipCtxDetach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED)), + ("cuCtxEnablePeerAccess", ("hipCtxEnablePeerAccess", CONV_PEER, API_DRIVER)), + ("cuCtxDisablePeerAccess", ("hipCtxDisablePeerAccess", CONV_PEER, API_DRIVER)), + ("cuDeviceCanAccessPeer", ("hipDeviceCanAccessPeer", CONV_PEER, API_DRIVER)), + ( + "cuDeviceGetP2PAttribute", + ("hipDeviceGetP2PAttribute", CONV_PEER, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuDevicePrimaryCtxGetState", + ("hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxRelease", + ("hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxReset", + ("hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxRetain", + ("hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER), + ), + ( + "cuDevicePrimaryCtxSetFlags", + ("hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER), + ), + ("cuDeviceGet", ("hipGetDevice", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetName", ("hipDeviceGetName", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetCount", ("hipGetDeviceCount", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetAttribute", ("hipDeviceGetAttribute", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetPCIBusId", ("hipDeviceGetPCIBusId", CONV_DEVICE, API_DRIVER)), + ("cuDeviceGetByPCIBusId", ("hipDeviceGetByPCIBusId", CONV_DEVICE, API_DRIVER)), + ("cuDeviceTotalMem_v2", ("hipDeviceTotalMem", CONV_DEVICE, API_DRIVER)), + ( + "cuDeviceComputeCapability", + ("hipDeviceComputeCapability", CONV_DEVICE, API_DRIVER), + ), + ("cuDeviceGetProperties", ("hipGetDeviceProperties", CONV_DEVICE, API_DRIVER)), + ("cuLinkAddData", ("hipLinkAddData", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuLinkAddFile", ("hipLinkAddFile", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuLinkComplete", + ("hipLinkComplete", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuLinkCreate", ("hipLinkCreate", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuLinkDestroy", ("hipLinkDestroy", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuModuleGetFunction", ("hipModuleGetFunction", CONV_MODULE, API_DRIVER)), + ("cuModuleGetGlobal_v2", ("hipModuleGetGlobal", CONV_MODULE, API_DRIVER)), + ( + "cuModuleGetSurfRef", + ("hipModuleGetSurfRef", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuModuleGetTexRef", ("hipModuleGetTexRef", CONV_MODULE, API_DRIVER)), + ("cuModuleLoad", ("hipModuleLoad", CONV_MODULE, API_DRIVER)), + ("cuModuleLoadData", ("hipModuleLoadData", CONV_MODULE, API_DRIVER)), + ("cuModuleLoadDataEx", ("hipModuleLoadDataEx", CONV_MODULE, API_DRIVER)), + ( + "cuModuleLoadFatBinary", + ("hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuModuleUnload", ("hipModuleUnload", CONV_MODULE, API_DRIVER)), + ( + "CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK", + ( + "hipDeviceP2PAttributePerformanceRank", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED", + ( + "hipDeviceP2PAttributeAccessSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED", + ( + "hipDeviceP2PAttributeNativeAtomicSupported", + CONV_TYPE, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("CU_EVENT_DEFAULT", ("hipEventDefault", CONV_EVENT, API_DRIVER)), + ("CU_EVENT_BLOCKING_SYNC", ("hipEventBlockingSync", CONV_EVENT, API_DRIVER)), + ("CU_EVENT_DISABLE_TIMING", ("hipEventDisableTiming", CONV_EVENT, API_DRIVER)), + 
("CU_EVENT_INTERPROCESS", ("hipEventInterprocess", CONV_EVENT, API_DRIVER)), + ("cuEventCreate", ("hipEventCreate", CONV_EVENT, API_DRIVER)), + ("cuEventDestroy_v2", ("hipEventDestroy", CONV_EVENT, API_DRIVER)), + ("cuEventElapsedTime", ("hipEventElapsedTime", CONV_EVENT, API_DRIVER)), + ("cuEventQuery", ("hipEventQuery", CONV_EVENT, API_DRIVER)), + ("cuEventRecord", ("hipEventRecord", CONV_EVENT, API_DRIVER)), + ("cuEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_DRIVER)), + ( + "cuFuncGetAttribute", + ("hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuFuncSetCacheConfig", ("hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER)), + ( + "cuFuncSetSharedMemConfig", + ("hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuLaunchKernel", ("hipModuleLaunchKernel", CONV_MODULE, API_DRIVER)), + ( + "cuFuncSetBlockShape", + ("hipFuncSetBlockShape", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuFuncSetSharedSize", + ("hipFuncSetSharedSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuLaunch", ("hipLaunch", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuLaunchGrid", ("hipLaunchGrid", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuLaunchGridAsync", + ("hipLaunchGridAsync", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuParamSetf", ("hipParamSetf", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ("cuParamSeti", ("hipParamSeti", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuParamSetSize", + ("hipParamSetSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuParamSetSize", + ("hipParamSetSize", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuParamSetv", ("hipParamSetv", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuOccupancyMaxActiveBlocksPerMultiprocessor", + ( + "hipOccupancyMaxActiveBlocksPerMultiprocessor", + CONV_OCCUPANCY, + API_DRIVER, + ), + ), + ( + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + ( + "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuOccupancyMaxPotentialBlockSize", + ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_DRIVER), + ), + ( + "cuOccupancyMaxPotentialBlockSizeWithFlags", + ( + "hipOccupancyMaxPotentialBlockSizeWithFlags", + CONV_OCCUPANCY, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("cuStreamAddCallback", ("hipStreamAddCallback", CONV_STREAM, API_DRIVER)), + ( + "cuStreamAttachMemAsync", + ("hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamCreate", + ("hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamCreateWithPriority", + ("hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuStreamDestroy_v2", ("hipStreamDestroy", CONV_STREAM, API_DRIVER)), + ("cuStreamGetFlags", ("hipStreamGetFlags", CONV_STREAM, API_DRIVER)), + ( + "cuStreamGetPriority", + ("hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuStreamQuery", ("hipStreamQuery", CONV_STREAM, API_DRIVER)), + ("cuStreamSynchronize", ("hipStreamSynchronize", CONV_STREAM, API_DRIVER)), + ("cuStreamWaitEvent", ("hipStreamWaitEvent", CONV_STREAM, API_DRIVER)), + ( + "cuStreamWaitValue32", + ("hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamWriteValue32", + ("hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuStreamBatchMemOp", + ("hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED), + ), + 
("cuArray3DCreate", ("hipArray3DCreate", CONV_MEM, API_DRIVER)), + ( + "cuArray3DGetDescriptor", + ("hipArray3DGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuArrayCreate", ("hipArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuArrayDestroy", ("hipArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuArrayGetDescriptor", + ("hipArrayGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcCloseMemHandle", + ("hipIpcCloseMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcGetEventHandle", + ("hipIpcGetEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcGetMemHandle", + ("hipIpcGetMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcOpenEventHandle", + ("hipIpcOpenEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuIpcOpenMemHandle", + ("hipIpcOpenMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemAlloc_v2", ("hipMalloc", CONV_MEM, API_DRIVER)), + ("cuMemAllocHost", ("hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemAllocManaged", + ("hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemAllocPitch", + ("hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpy", ("hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpy2D", ("hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpy2DAsync", + ("hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemcpy2DUnaligned", + ("hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpy3D", ("hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpy3DAsync", + ("hipMemcpy3DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemcpy3DPeer", + ("hipMemcpy3DPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemcpy3DPeerAsync", + ("hipMemcpy3DPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyAsync", ("hipMemcpyAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyAtoA", ("hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyAtoD", ("hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyAtoH", ("hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpyAtoHAsync", + ("hipMemcpyAtoHAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyDtoA", ("hipMemcpyDtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemcpyDtoD_v2", ("hipMemcpyDtoD", CONV_MEM, API_DRIVER)), + ("cuMemcpyDtoDAsync_v2", ("hipMemcpyDtoDAsync", CONV_MEM, API_DRIVER)), + ("cuMemcpyDtoH_v2", ("hipMemcpyDtoH", CONV_MEM, API_DRIVER)), + ("cuMemcpyDtoHAsync_v2", ("hipMemcpyDtoHAsync", CONV_MEM, API_DRIVER)), + ("cuMemcpyHtoA", ("hipMemcpyHtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemcpyHtoAAsync", + ("hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyHtoD_v2", ("hipMemcpyHtoD", CONV_MEM, API_DRIVER)), + ("cuMemcpyHtoDAsync_v2", ("hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER)), + ( + "cuMemcpyPeerAsync", + ("hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemcpyPeer", ("hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ("cuMemFree_v2", ("hipFree", CONV_MEM, API_DRIVER)), + ("cuMemFreeHost", ("hipHostFree", CONV_MEM, API_DRIVER)), + ( + "cuMemGetAddressRange", + ("hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemGetInfo_v2", ("hipMemGetInfo", CONV_MEM, API_DRIVER)), + ("cuMemHostAlloc", 
("hipHostMalloc", CONV_MEM, API_DRIVER)), + ( + "cuMemHostGetDevicePointer", + ("hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemHostGetFlags", + ("hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemHostRegister_v2", ("hipHostRegister", CONV_MEM, API_DRIVER)), + ("cuMemHostUnregister", ("hipHostUnregister", CONV_MEM, API_DRIVER)), + ("cuMemsetD16_v2", ("hipMemsetD16", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD16Async", + ("hipMemsetD16Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD2D16_v2", ("hipMemsetD2D16", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD2D16Async", + ("hipMemsetD2D16Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD2D32_v2", ("hipMemsetD2D32", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD2D32Async", + ("hipMemsetD2D32Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD2D8_v2", ("hipMemsetD2D8", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD2D8Async", + ("hipMemsetD2D8Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemsetD32_v2", ("hipMemset", CONV_MEM, API_DRIVER)), + ("cuMemsetD32Async", ("hipMemsetAsync", CONV_MEM, API_DRIVER)), + ("cuMemsetD8_v2", ("hipMemsetD8", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemsetD8Async", + ("hipMemsetD8Async", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMipmappedArrayCreate", + ("hipMipmappedArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMipmappedArrayDestroy", + ("hipMipmappedArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMipmappedArrayGetLevel", + ("hipMipmappedArrayGetLevel", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemPrefetchAsync", + ("hipMemPrefetchAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuMemAdvise", ("hipMemAdvise", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuMemRangeGetAttribute", + ("hipMemRangeGetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuMemRangeGetAttributes", + ("hipMemRangeGetAttributes", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuPointerGetAttribute", + ("hipPointerGetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuPointerGetAttributes", + ("hipPointerGetAttributes", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuPointerSetAttribute", + ("hipPointerSetAttribute", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED), + ), + ("CU_TR_FILTER_MODE_POINT", ("hipFilterModePoint", CONV_TEX, API_DRIVER)), + ( + "CU_TR_FILTER_MODE_LINEAR", + ("hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetAddress", + ("hipTexRefGetAddress", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetAddressMode", + ("hipTexRefGetAddressMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetArray", + ("hipTexRefGetArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetBorderColor", + ("hipTexRefGetBorderColor", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetFilterMode", + ("hipTexRefGetFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetFlags", + ("hipTexRefGetFlags", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetFormat", + ("hipTexRefGetFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMaxAnisotropy", + ("hipTexRefGetMaxAnisotropy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmapFilterMode", + ("hipTexRefGetMipmapFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + 
), + ( + "cuTexRefGetMipmapLevelBias", + ("hipTexRefGetMipmapLevelBias", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmapLevelClamp", + ("hipTexRefGetMipmapLevelClamp", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefGetMipmappedArray", + ("hipTexRefGetMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetAddress", + ("hipTexRefSetAddress", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetAddress2D", + ("hipTexRefSetAddress2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuTexRefSetAddressMode", ("hipTexRefSetAddressMode", CONV_TEX, API_DRIVER)), + ("cuTexRefSetArray", ("hipTexRefSetArray", CONV_TEX, API_DRIVER)), + ( + "cuTexRefSetBorderColor", + ("hipTexRefSetBorderColor", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuTexRefSetFilterMode", ("hipTexRefSetFilterMode", CONV_TEX, API_DRIVER)), + ("cuTexRefSetFlags", ("hipTexRefSetFlags", CONV_TEX, API_DRIVER)), + ("cuTexRefSetFormat", ("hipTexRefSetFormat", CONV_TEX, API_DRIVER)), + ( + "cuTexRefSetMaxAnisotropy", + ("hipTexRefSetMaxAnisotropy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmapFilterMode", + ("hipTexRefSetMipmapFilterMode", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmapLevelBias", + ("hipTexRefSetMipmapLevelBias", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmapLevelClamp", + ("hipTexRefSetMipmapLevelClamp", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexRefSetMipmappedArray", + ("hipTexRefSetMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuTexRefCreate", ("hipTexRefCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuTexRefDestroy", + ("hipTexRefDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfRefGetArray", + ("hipSurfRefGetArray", CONV_SURFACE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfRefSetArray", + ("hipSurfRefSetArray", CONV_SURFACE, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectCreate", + ("hipTexObjectCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectDestroy", + ("hipTexObjectDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectGetResourceDesc", + ("hipTexObjectGetResourceDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectGetResourceViewDesc", + ("hipTexObjectGetResourceViewDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuTexObjectGetTextureDesc", + ("hipTexObjectGetTextureDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfObjectCreate", + ("hipSurfObjectCreate", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfObjectDestroy", + ("hipSurfObjectDestroy", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuSurfObjectGetResourceDesc", + ("hipSurfObjectGetResourceDesc", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsMapResources", + ("hipGraphicsMapResources", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsResourceGetMappedMipmappedArray", + ( + "hipGraphicsResourceGetMappedMipmappedArray", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsResourceGetMappedPointer", + ( + "hipGraphicsResourceGetMappedPointer", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsResourceSetMapFlags", + ( + "hipGraphicsResourceSetMapFlags", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsSubResourceGetMappedArray", + ( + "hipGraphicsSubResourceGetMappedArray", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), 
+ ( + "cuGraphicsUnmapResources", + ("hipGraphicsUnmapResources", CONV_GRAPHICS, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsUnregisterResource", + ( + "hipGraphicsUnregisterResource", + CONV_GRAPHICS, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuProfilerInitialize", + ("hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuProfilerStart", ("hipProfilerStart", CONV_OTHER, API_DRIVER)), + ("cuProfilerStop", ("hipProfilerStop", CONV_OTHER, API_DRIVER)), + ( + "CU_GL_DEVICE_LIST_ALL", + ("HIP_GL_DEVICE_LIST_ALL", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GL_DEVICE_LIST_CURRENT_FRAME", + ("HIP_GL_DEVICE_LIST_CURRENT_FRAME", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GL_DEVICE_LIST_NEXT_FRAME", + ("HIP_GL_DEVICE_LIST_NEXT_FRAME", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuGLGetDevices", ("hipGLGetDevices", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuGraphicsGLRegisterBuffer", + ("hipGraphicsGLRegisterBuffer", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsGLRegisterImage", + ("hipGraphicsGLRegisterImage", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ("cuWGLGetDevice", ("hipWGLGetDevice", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "CU_GL_MAP_RESOURCE_FLAGS_NONE", + ("HIP_GL_MAP_RESOURCE_FLAGS_NONE", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY", + CONV_GL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + CONV_GL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("cuGLCtxCreate", ("hipGLCtxCreate", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ("cuGLInit", ("hipGLInit", CONV_GL, API_DRIVER, HIP_UNSUPPORTED)), + ( + "cuGLMapBufferObject", + ("hipGLMapBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLMapBufferObjectAsync", + ("hipGLMapBufferObjectAsync", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLRegisterBufferObject", + ("hipGLRegisterBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLSetBufferObjectMapFlags", + ("hipGLSetBufferObjectMapFlags", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLUnmapBufferObject", + ("hipGLUnmapBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLUnmapBufferObjectAsync", + ("hipGLUnmapBufferObjectAsync", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGLUnregisterBufferObject", + ("hipGLUnregisterBufferObject", CONV_GL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_DEVICE_LIST_ALL", + ("HIP_D3D9_DEVICE_LIST_ALL", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_DEVICE_LIST_CURRENT_FRAME", + ( + "HIP_D3D9_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D9_DEVICE_LIST_NEXT_FRAME", + ("HIP_D3D9_DEVICE_LIST_NEXT_FRAME", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9CtxCreate", + ("hipD3D9CtxCreate", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9CtxCreateOnDevice", + ("hipD3D9CtxCreateOnDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9GetDevice", + ("hipD3D9GetDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9GetDevices", + ("hipD3D9GetDevices", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9GetDirect3DDevice", + ("hipD3D9GetDirect3DDevice", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsD3D9RegisterResource", + ("hipGraphicsD3D9RegisterResource", CONV_D3D9, API_DRIVER, 
HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_MAPRESOURCE_FLAGS_NONE", + ("HIP_D3D9_MAPRESOURCE_FLAGS_NONE", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_MAPRESOURCE_FLAGS_READONLY", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D9_REGISTER_FLAGS_NONE", + ("HIP_D3D9_REGISTER_FLAGS_NONE", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D9_REGISTER_FLAGS_ARRAY", + ("HIP_D3D9_REGISTER_FLAGS_ARRAY", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9MapResources", + ("hipD3D9MapResources", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9RegisterResource", + ("hipD3D9RegisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedArray", + ("hipD3D9ResourceGetMappedArray", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedPitch", + ("hipD3D9ResourceGetMappedPitch", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedPointer", + ("hipD3D9ResourceGetMappedPointer", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetMappedSize", + ("hipD3D9ResourceGetMappedSize", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9ResourceGetSurfaceDimensions", + ( + "hipD3D9ResourceGetSurfaceDimensions", + CONV_D3D9, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D9ResourceSetMapFlags", + ("hipD3D9ResourceSetMapFlags", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9UnmapResources", + ("hipD3D9UnmapResources", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D9UnregisterResource", + ("hipD3D9UnregisterResource", CONV_D3D9, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D10_DEVICE_LIST_ALL", + ("HIP_D3D10_DEVICE_LIST_ALL", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D10_DEVICE_LIST_CURRENT_FRAME", + ( + "HIP_D3D10_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_DEVICE_LIST_NEXT_FRAME", + ( + "HIP_D3D10_DEVICE_LIST_NEXT_FRAME", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D10GetDevice", + ("hipD3D10GetDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10GetDevices", + ("hipD3D10GetDevices", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsD3D10RegisterResource", + ( + "hipGraphicsD3D10RegisterResource", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_MAPRESOURCE_FLAGS_NONE", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_NONE", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_MAPRESOURCE_FLAGS_READONLY", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D10_REGISTER_FLAGS_NONE", + ("HIP_D3D10_REGISTER_FLAGS_NONE", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D10_REGISTER_FLAGS_ARRAY", + ("HIP_D3D10_REGISTER_FLAGS_ARRAY", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10CtxCreate", + ("hipD3D10CtxCreate", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10CtxCreateOnDevice", + ("hipD3D10CtxCreateOnDevice", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10GetDirect3DDevice", + ("hipD3D10GetDirect3DDevice", 
CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10MapResources", + ("hipD3D10MapResources", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10RegisterResource", + ("hipD3D10RegisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetMappedArray", + ("hipD3D10ResourceGetMappedArray", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetMappedPitch", + ("hipD3D10ResourceGetMappedPitch", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetMappedPointer", + ( + "hipD3D10ResourceGetMappedPointer", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D10ResourceGetMappedSize", + ("hipD3D10ResourceGetMappedSize", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10ResourceGetSurfaceDimensions", + ( + "hipD3D10ResourceGetSurfaceDimensions", + CONV_D3D10, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD310ResourceSetMapFlags", + ("hipD3D10ResourceSetMapFlags", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10UnmapResources", + ("hipD3D10UnmapResources", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D10UnregisterResource", + ("hipD3D10UnregisterResource", CONV_D3D10, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D11_DEVICE_LIST_ALL", + ("HIP_D3D11_DEVICE_LIST_ALL", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "CU_D3D11_DEVICE_LIST_CURRENT_FRAME", + ( + "HIP_D3D11_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D11, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "CU_D3D11_DEVICE_LIST_NEXT_FRAME", + ( + "HIP_D3D11_DEVICE_LIST_NEXT_FRAME", + CONV_D3D11, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D11GetDevice", + ("hipD3D11GetDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D11GetDevices", + ("hipD3D11GetDevices", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsD3D11RegisterResource", + ( + "hipGraphicsD3D11RegisterResource", + CONV_D3D11, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuD3D11CtxCreate", + ("hipD3D11CtxCreate", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D11CtxCreateOnDevice", + ("hipD3D11CtxCreateOnDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuD3D11GetDirect3DDevice", + ("hipD3D11GetDirect3DDevice", CONV_D3D11, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsVDPAURegisterOutputSurface", + ( + "hipGraphicsVDPAURegisterOutputSurface", + CONV_VDPAU, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuGraphicsVDPAURegisterVideoSurface", + ( + "hipGraphicsVDPAURegisterVideoSurface", + CONV_VDPAU, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuVDPAUGetDevice", + ("hipVDPAUGetDevice", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuVDPAUCtxCreate", + ("hipVDPAUCtxCreate", CONV_VDPAU, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerAcquireFrame", + ("hipEGLStreamConsumerAcquireFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerConnect", + ("hipEGLStreamConsumerConnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerConnectWithFlags", + ( + "hipEGLStreamConsumerConnectWithFlags", + CONV_EGL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ( + "cuEGLStreamConsumerDisconnect", + ("hipEGLStreamConsumerDisconnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamConsumerReleaseFrame", + ("hipEGLStreamConsumerReleaseFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerConnect", + ("hipEGLStreamProducerConnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( 
+ "cuEGLStreamProducerDisconnect", + ("hipEGLStreamProducerDisconnect", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerPresentFrame", + ("hipEGLStreamProducerPresentFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuEGLStreamProducerReturnFrame", + ("hipEGLStreamProducerReturnFrame", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsEGLRegisterImage", + ("hipGraphicsEGLRegisterImage", CONV_EGL, API_DRIVER, HIP_UNSUPPORTED), + ), + ( + "cuGraphicsResourceGetMappedEglFrame", + ( + "hipGraphicsResourceGetMappedEglFrame", + CONV_EGL, + API_DRIVER, + HIP_UNSUPPORTED, + ), + ), + ("cudaDataType_t", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaDataType", ("hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_16F", ("hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_16F", ("hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32F", ("hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32F", ("hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_64F", ("hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_64F", ("hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_8I", ("hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_8I", ("hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_8U", ("hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_8U", ("hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32I", ("hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32I", ("hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_R_32U", ("hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ("CUDA_C_32U", ("hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "MAJOR_VERSION", + ("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "MINOR_VERSION", + ("hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "PATCH_LEVEL", + ("hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAttachGlobal", + ("hipMemAttachGlobal", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAttachHost", + ("hipMemAttachHost", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAttachSingle", + ("hipMemAttachSingle", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaOccupancyDefault", + ("hipOccupancyDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaOccupancyDisableCachingOverride", + ( + "hipOccupancyDisableCachingOverride", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaGetLastError", ("hipGetLastError", CONV_ERROR, API_RUNTIME)), + ("cudaPeekAtLastError", ("hipPeekAtLastError", CONV_ERROR, API_RUNTIME)), + ("cudaGetErrorName", ("hipGetErrorName", CONV_ERROR, API_RUNTIME)), + ("cudaGetErrorString", ("hipGetErrorString", CONV_ERROR, API_RUNTIME)), + ("cudaMemcpy3DParms", ("hipMemcpy3DParms", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpy3DPeerParms", + ("hipMemcpy3DPeerParms", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpy", ("hipMemcpy", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyToArray", ("hipMemcpyToArray", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyToSymbol", ("hipMemcpyToSymbol", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyToSymbolAsync", ("hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyAsync", ("hipMemcpyAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpy2D", ("hipMemcpy2D", CONV_MEM, API_RUNTIME)), + ("cudaMemcpy2DAsync", ("hipMemcpy2DAsync", 
CONV_MEM, API_RUNTIME)), + ("cudaMemcpy2DToArray", ("hipMemcpy2DToArray", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpy2DArrayToArray", + ("hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy2DFromArray", + ("hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy2DFromArrayAsync", + ("hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy2DToArrayAsync", + ("hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpy3D", ("hipMemcpy3D", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpy3DAsync", + ("hipMemcpy3DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy3DPeer", + ("hipMemcpy3DPeer", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpy3DPeerAsync", + ("hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpyArrayToArray", + ("hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemcpyFromArrayAsync", + ("hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemcpyFromSymbol", ("hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpyFromSymbolAsync", + ("hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME), + ), + ("cudaMemAdvise", ("hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaMemRangeGetAttribute", + ("hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeGetAttributes", + ("hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseSetReadMostly", + ("hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseUnsetReadMostly", + ("hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseSetPreferredLocation", + ( + "hipMemAdviseSetPreferredLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaMemAdviseUnsetPreferredLocation", + ( + "hipMemAdviseUnsetPreferredLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaMemAdviseSetAccessedBy", + ("hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemAdviseUnsetAccessedBy", + ("hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttributeReadMostly", + ("hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttributePreferredLocation", + ( + "hipMemRangeAttributePreferredLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaMemRangeAttributeAccessedBy", + ("hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemRangeAttributeLastPrefetchLocation", + ( + "hipMemRangeAttributeLastPrefetchLocation", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaMemcpyHostToHost", ("hipMemcpyHostToHost", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyHostToDevice", ("hipMemcpyHostToDevice", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyDeviceToHost", ("hipMemcpyDeviceToHost", CONV_MEM, API_RUNTIME)), + ( + "cudaMemcpyDeviceToDevice", + ("hipMemcpyDeviceToDevice", CONV_MEM, API_RUNTIME), + ), + ("cudaMemcpyDefault", ("hipMemcpyDefault", CONV_MEM, API_RUNTIME)), + ("cudaMemset", ("hipMemset", CONV_MEM, API_RUNTIME)), + ("cudaMemsetAsync", ("hipMemsetAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemset2D", ("hipMemset2D", CONV_MEM, API_RUNTIME)), + ( + "cudaMemset2DAsync", + ("hipMemset2DAsync", CONV_MEM, API_RUNTIME, 
HIP_UNSUPPORTED), + ), + ("cudaMemset3D", ("hipMemset3D", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaMemset3DAsync", + ("hipMemset3DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMemGetInfo", ("hipMemGetInfo", CONV_MEM, API_RUNTIME)), + ( + "cudaArrayGetInfo", + ("hipArrayGetInfo", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFreeMipmappedArray", + ("hipFreeMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetMipmappedArrayLevel", + ("hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSymbolAddress", + ("hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSymbolSize", + ("hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMemPrefetchAsync", + ("hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMallocHost", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), + ("cudaMallocArray", ("hipMallocArray", CONV_MEM, API_RUNTIME)), + ("cudaMalloc", ("hipMalloc", CONV_MEM, API_RUNTIME)), + ("cudaMalloc3D", ("hipMalloc3D", CONV_MEM, API_RUNTIME)), + ("cudaMalloc3DArray", ("hipMalloc3DArray", CONV_MEM, API_RUNTIME)), + ( + "cudaMallocManaged", + ("hipMallocManaged", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaMallocMipmappedArray", + ("hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaMallocPitch", ("hipMallocPitch", CONV_MEM, API_RUNTIME)), + ("cudaFreeHost", ("hipHostFree", CONV_MEM, API_RUNTIME)), + ("cudaFreeArray", ("hipFreeArray", CONV_MEM, API_RUNTIME)), + ("cudaFree", ("hipFree", CONV_MEM, API_RUNTIME)), + ("cudaHostRegister", ("hipHostRegister", CONV_MEM, API_RUNTIME)), + ("cudaHostUnregister", ("hipHostUnregister", CONV_MEM, API_RUNTIME)), + ("cudaHostAlloc", ("hipHostMalloc", CONV_MEM, API_RUNTIME)), + ("cudaMemoryTypeHost", ("hipMemoryTypeHost", CONV_MEM, API_RUNTIME)), + ("cudaMemoryTypeDevice", ("hipMemoryTypeDevice", CONV_MEM, API_RUNTIME)), + ("make_cudaExtent", ("make_hipExtent", CONV_MEM, API_RUNTIME)), + ("make_cudaPitchedPtr", ("make_hipPitchedPtr", CONV_MEM, API_RUNTIME)), + ("make_cudaPos", ("make_hipPos", CONV_MEM, API_RUNTIME)), + ("cudaHostAllocDefault", ("hipHostMallocDefault", CONV_MEM, API_RUNTIME)), + ("cudaHostAllocPortable", ("hipHostMallocPortable", CONV_MEM, API_RUNTIME)), + ("cudaHostAllocMapped", ("hipHostMallocMapped", CONV_MEM, API_RUNTIME)), + ( + "cudaHostAllocWriteCombined", + ("hipHostMallocWriteCombined", CONV_MEM, API_RUNTIME), + ), + ("cudaHostGetFlags", ("hipHostGetFlags", CONV_MEM, API_RUNTIME)), + ("cudaHostRegisterDefault", ("hipHostRegisterDefault", CONV_MEM, API_RUNTIME)), + ( + "cudaHostRegisterPortable", + ("hipHostRegisterPortable", CONV_MEM, API_RUNTIME), + ), + ("cudaHostRegisterMapped", ("hipHostRegisterMapped", CONV_MEM, API_RUNTIME)), + ( + "cudaHostRegisterIoMemory", + ("hipHostRegisterIoMemory", CONV_MEM, API_RUNTIME), + ), + # ("warpSize", ("hipWarpSize", CONV_SPECIAL_FUNC, API_RUNTIME), (HIP actually uses warpSize...), + ("cudaEventCreate", ("hipEventCreate", CONV_EVENT, API_RUNTIME)), + ( + "cudaEventCreateWithFlags", + ("hipEventCreateWithFlags", CONV_EVENT, API_RUNTIME), + ), + ("cudaEventDestroy", ("hipEventDestroy", CONV_EVENT, API_RUNTIME)), + ("cudaEventRecord", ("hipEventRecord", CONV_EVENT, API_RUNTIME)), + ("cudaEventElapsedTime", ("hipEventElapsedTime", CONV_EVENT, API_RUNTIME)), + ("cudaEventSynchronize", ("hipEventSynchronize", CONV_EVENT, API_RUNTIME)), + ("cudaEventQuery", ("hipEventQuery", CONV_EVENT, API_RUNTIME)), 
+ ("cudaEventDefault", ("hipEventDefault", CONV_EVENT, API_RUNTIME)), + ("cudaEventBlockingSync", ("hipEventBlockingSync", CONV_EVENT, API_RUNTIME)), + ("cudaEventDisableTiming", ("hipEventDisableTiming", CONV_EVENT, API_RUNTIME)), + ("cudaEventInterprocess", ("hipEventInterprocess", CONV_EVENT, API_RUNTIME)), + ("cudaStreamCreate", ("hipStreamCreate", CONV_STREAM, API_RUNTIME)), + ( + "cudaStreamCreateWithFlags", + ("hipStreamCreateWithFlags", CONV_STREAM, API_RUNTIME), + ), + ( + "cudaStreamCreateWithPriority", + ("hipStreamCreateWithPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaStreamDestroy", ("hipStreamDestroy", CONV_STREAM, API_RUNTIME)), + ("cudaStreamWaitEvent", ("hipStreamWaitEvent", CONV_STREAM, API_RUNTIME)), + ("cudaStreamSynchronize", ("hipStreamSynchronize", CONV_STREAM, API_RUNTIME)), + ("cudaStreamGetFlags", ("hipStreamGetFlags", CONV_STREAM, API_RUNTIME)), + ("cudaStreamQuery", ("hipStreamQuery", CONV_STREAM, API_RUNTIME)), + ("cudaStreamAddCallback", ("hipStreamAddCallback", CONV_STREAM, API_RUNTIME)), + ( + "cudaStreamAttachMemAsync", + ("hipStreamAttachMemAsync", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaStreamGetPriority", + ("hipStreamGetPriority", CONV_STREAM, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaStreamDefault", ("hipStreamDefault", CONV_TYPE, API_RUNTIME)), + ("cudaStreamNonBlocking", ("hipStreamNonBlocking", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceSynchronize", ("hipDeviceSynchronize", CONV_DEVICE, API_RUNTIME)), + ("cudaDeviceReset", ("hipDeviceReset", CONV_DEVICE, API_RUNTIME)), + ("cudaSetDevice", ("hipSetDevice", CONV_DEVICE, API_RUNTIME)), + ("cudaGetDevice", ("hipGetDevice", CONV_DEVICE, API_RUNTIME)), + ("cudaGetDeviceCount", ("hipGetDeviceCount", CONV_DEVICE, API_RUNTIME)), + ("cudaChooseDevice", ("hipChooseDevice", CONV_DEVICE, API_RUNTIME)), + ("cudaThreadExit", ("hipDeviceReset", CONV_THREAD, API_RUNTIME)), + ( + "cudaThreadGetCacheConfig", + ("hipDeviceGetCacheConfig", CONV_THREAD, API_RUNTIME), + ), + ( + "cudaThreadGetLimit", + ("hipThreadGetLimit", CONV_THREAD, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaThreadSetCacheConfig", + ("hipDeviceSetCacheConfig", CONV_THREAD, API_RUNTIME), + ), + ( + "cudaThreadSetLimit", + ("hipThreadSetLimit", CONV_THREAD, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaThreadSynchronize", ("hipDeviceSynchronize", CONV_THREAD, API_RUNTIME)), + ("cudaDeviceGetAttribute", ("hipDeviceGetAttribute", CONV_DEVICE, API_RUNTIME)), + ( + "cudaDevAttrMaxThreadsPerBlock", + ("hipDeviceAttributeMaxThreadsPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxBlockDimX", + ("hipDeviceAttributeMaxBlockDimX", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxBlockDimY", + ("hipDeviceAttributeMaxBlockDimY", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxBlockDimZ", + ("hipDeviceAttributeMaxBlockDimZ", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxGridDimX", + ("hipDeviceAttributeMaxGridDimX", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxGridDimY", + ("hipDeviceAttributeMaxGridDimY", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxGridDimZ", + ("hipDeviceAttributeMaxGridDimZ", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxSharedMemoryPerBlock", + ("hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrTotalConstantMemory", + ("hipDeviceAttributeTotalConstantMemory", CONV_TYPE, API_RUNTIME), + ), + ("cudaDevAttrWarpSize", ("hipDeviceAttributeWarpSize", CONV_TYPE, API_RUNTIME)), + ( + "cudaDevAttrMaxPitch", + 
("hipDeviceAttributeMaxPitch", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrMaxRegistersPerBlock", + ("hipDeviceAttributeMaxRegistersPerBlock", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrClockRate", + ("hipDeviceAttributeClockRate", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrTextureAlignment", + ( + "hipDeviceAttributeTextureAlignment", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrGpuOverlap", + ("hipDeviceAttributeGpuOverlap", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrMultiProcessorCount", + ("hipDeviceAttributeMultiprocessorCount", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrKernelExecTimeout", + ( + "hipDeviceAttributeKernelExecTimeout", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrIntegrated", + ("hipDeviceAttributeIntegrated", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrCanMapHostMemory", + ( + "hipDeviceAttributeCanMapHostMemory", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrComputeMode", + ("hipDeviceAttributeComputeMode", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxTexture1DWidth", + ( + "hipDeviceAttributeMaxTexture1DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DWidth", + ( + "hipDeviceAttributeMaxTexture2DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DHeight", + ( + "hipDeviceAttributeMaxTexture2DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DWidth", + ( + "hipDeviceAttributeMaxTexture3DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DHeight", + ( + "hipDeviceAttributeMaxTexture3DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DDepth", + ( + "hipDeviceAttributeMaxTexture3DDepth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLayeredWidth", + ( + "hipDeviceAttributeMaxTexture2DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLayeredHeight", + ( + "hipDeviceAttributeMaxTexture2DLayeredHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLayeredLayers", + ( + "hipDeviceAttributeMaxTexture2DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrSurfaceAlignment", + ( + "hipDeviceAttributeSurfaceAlignment", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrConcurrentKernels", + ("hipDeviceAttributeConcurrentKernels", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrEccEnabled", + ("hipDeviceAttributeEccEnabled", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDevAttrPciBusId", ("hipDeviceAttributePciBusId", CONV_TYPE, API_RUNTIME)), + ( + "cudaDevAttrPciDeviceId", + ("hipDeviceAttributePciDeviceId", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrTccDriver", + ("hipDeviceAttributeTccDriver", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrMemoryClockRate", + ("hipDeviceAttributeMemoryClockRate", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrGlobalMemoryBusWidth", + ("hipDeviceAttributeMemoryBusWidth", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrL2CacheSize", + ("hipDeviceAttributeL2CacheSize", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxThreadsPerMultiProcessor", + ("hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_TYPE, API_RUNTIME), + ), + ( + 
"cudaDevAttrAsyncEngineCount", + ( + "hipDeviceAttributeAsyncEngineCount", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrUnifiedAddressing", + ( + "hipDeviceAttributeUnifiedAddressing", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture1DLayeredWidth", + ( + "hipDeviceAttributeMaxTexture1DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture1DLayeredLayers", + ( + "hipDeviceAttributeMaxTexture1DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DGatherWidth", + ( + "hipDeviceAttributeMaxTexture2DGatherWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DGatherHeight", + ( + "hipDeviceAttributeMaxTexture2DGatherHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DWidthAlt", + ( + "hipDeviceAttributeMaxTexture3DWidthAlternate", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DHeightAlt", + ( + "hipDeviceAttributeMaxTexture3DHeightAlternate", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture3DDepthAlt", + ( + "hipDeviceAttributeMaxTexture3DDepthAlternate", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrPciDomainId", + ("hipDeviceAttributePciDomainId", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevAttrTexturePitchAlignment", + ( + "hipDeviceAttributeTexturePitchAlignment", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTextureCubemapWidth", + ( + "hipDeviceAttributeMaxTextureCubemapWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTextureCubemapLayeredWidth", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTextureCubemapLayeredLayers", + ( + "hipDeviceAttributeMaxTextureCubemapLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface1DWidth", + ( + "hipDeviceAttributeMaxSurface1DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DWidth", + ( + "hipDeviceAttributeMaxSurface2DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DHeight", + ( + "hipDeviceAttributeMaxSurface2DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface3DWidth", + ( + "hipDeviceAttributeMaxSurface3DWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface3DHeight", + ( + "hipDeviceAttributeMaxSurface3DHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface3DDepth", + ( + "hipDeviceAttributeMaxSurface3DDepth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface1DLayeredWidth", + ( + "hipDeviceAttributeMaxSurface1DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface1DLayeredLayers", + ( + "hipDeviceAttributeMaxSurface1DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DLayeredWidth", + ( + "hipDeviceAttributeMaxSurface2DLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurface2DLayeredHeight", + ( + "hipDeviceAttributeMaxSurface2DLayeredHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + 
( + "cudaDevAttrMaxSurface2DLayeredLayers", + ( + "hipDeviceAttributeMaxSurface2DLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurfaceCubemapWidth", + ( + "hipDeviceAttributeMaxSurfaceCubemapWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurfaceCubemapLayeredWidth", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSurfaceCubemapLayeredLayers", + ( + "hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture1DLinearWidth", + ( + "hipDeviceAttributeMaxTexture1DLinearWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLinearWidth", + ( + "hipDeviceAttributeMaxTexture2DLinearWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLinearHeight", + ( + "hipDeviceAttributeMaxTexture2DLinearHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DLinearPitch", + ( + "hipDeviceAttributeMaxTexture2DLinearPitch", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DMipmappedWidth", + ( + "hipDeviceAttributeMaxTexture2DMipmappedWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxTexture2DMipmappedHeight", + ( + "hipDeviceAttributeMaxTexture2DMipmappedHeight", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrComputeCapabilityMajor", + ("hipDeviceAttributeComputeCapabilityMajor", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrComputeCapabilityMinor", + ("hipDeviceAttributeComputeCapabilityMinor", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMaxTexture1DMipmappedWidth", + ( + "hipDeviceAttributeMaxTexture1DMipmappedWidth", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrStreamPrioritiesSupported", + ( + "hipDeviceAttributeStreamPrioritiesSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrGlobalL1CacheSupported", + ( + "hipDeviceAttributeGlobalL1CacheSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrLocalL1CacheSupported", + ( + "hipDeviceAttributeLocalL1CacheSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrMaxSharedMemoryPerMultiprocessor", + ( + "hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", + CONV_TYPE, + API_RUNTIME, + ), + ), + ( + "cudaDevAttrMaxRegistersPerMultiprocessor", + ( + "hipDeviceAttributeMaxRegistersPerMultiprocessor", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrManagedMemory", + ( + "hipDeviceAttributeManagedMemory", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrIsMultiGpuBoard", + ("hipDeviceAttributeIsMultiGpuBoard", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDevAttrMultiGpuBoardGroupID", + ( + "hipDeviceAttributeMultiGpuBoardGroupID", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrHostNativeAtomicSupported", + ( + "hipDeviceAttributeHostNativeAtomicSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrSingleToDoublePrecisionPerfRatio", + ( + "hipDeviceAttributeSingleToDoublePrecisionPerfRatio", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrPageableMemoryAccess", + ( + "hipDeviceAttributePageableMemoryAccess", + CONV_TYPE, + 
API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrConcurrentManagedAccess", + ( + "hipDeviceAttributeConcurrentManagedAccess", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrComputePreemptionSupported", + ( + "hipDeviceAttributeComputePreemptionSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevAttrCanUseHostPointerForRegisteredMem", + ( + "hipDeviceAttributeCanUseHostPointerForRegisteredMem", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaPointerGetAttributes", + ("hipPointerGetAttributes", CONV_MEM, API_RUNTIME), + ), + ( + "cudaHostGetDevicePointer", + ("hipHostGetDevicePointer", CONV_MEM, API_RUNTIME), + ), + ( + "cudaGetDeviceProperties", + ("hipGetDeviceProperties", CONV_DEVICE, API_RUNTIME), + ), + ("cudaDeviceGetPCIBusId", ("hipDeviceGetPCIBusId", CONV_DEVICE, API_RUNTIME)), + ( + "cudaDeviceGetByPCIBusId", + ("hipDeviceGetByPCIBusId", CONV_DEVICE, API_RUNTIME), + ), + ( + "cudaDeviceGetStreamPriorityRange", + ( + "hipDeviceGetStreamPriorityRange", + CONV_DEVICE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaSetValidDevices", + ("hipSetValidDevices", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDevP2PAttrPerformanceRank", + ( + "hipDeviceP2PAttributePerformanceRank", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevP2PAttrAccessSupported", + ( + "hipDeviceP2PAttributeAccessSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDevP2PAttrNativeAtomicSupported", + ( + "hipDeviceP2PAttributeNativeAtomicSupported", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaDeviceGetP2PAttribute", + ("hipDeviceGetP2PAttribute", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeDefault", + ("hipComputeModeDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeExclusive", + ("hipComputeModeExclusive", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeProhibited", + ("hipComputeModeProhibited", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaComputeModeExclusiveProcess", + ("hipComputeModeExclusiveProcess", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetDeviceFlags", + ("hipGetDeviceFlags", CONV_DEVICE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaSetDeviceFlags", ("hipSetDeviceFlags", CONV_DEVICE, API_RUNTIME)), + ("cudaDeviceScheduleAuto", ("hipDeviceScheduleAuto", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceScheduleSpin", ("hipDeviceScheduleSpin", CONV_TYPE, API_RUNTIME)), + ("cudaDeviceScheduleYield", ("hipDeviceScheduleYield", CONV_TYPE, API_RUNTIME)), + ( + "cudaDeviceBlockingSync", + ("hipDeviceScheduleBlockingSync", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDeviceScheduleBlockingSync", + ("hipDeviceScheduleBlockingSync", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDeviceScheduleMask", + ("hipDeviceScheduleMask", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDeviceMapHost", ("hipDeviceMapHost", CONV_TYPE, API_RUNTIME)), + ( + "cudaDeviceLmemResizeToMax", + ("hipDeviceLmemResizeToMax", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDeviceMask", ("hipDeviceMask", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaDeviceSetCacheConfig", + ("hipDeviceSetCacheConfig", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaDeviceGetCacheConfig", + ("hipDeviceGetCacheConfig", CONV_CACHE, API_RUNTIME), + ), + ("cudaFuncSetCacheConfig", ("hipFuncSetCacheConfig", CONV_CACHE, API_RUNTIME)), + ( + "cudaFuncCachePreferNone", + 
("hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaFuncCachePreferShared", + ("hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME), + ), + ("cudaFuncCachePreferL1", ("hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME)), + ( + "cudaFuncCachePreferEqual", + ("hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME), + ), + ( + "cudaFuncGetAttributes", + ("hipFuncGetAttributes", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFuncSetSharedMemConfig", + ("hipFuncSetSharedMemConfig", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetParameterBuffer", + ("hipGetParameterBuffer", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaSetDoubleForDevice", + ("hipSetDoubleForDevice", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaSetDoubleForHost", + ("hipSetDoubleForHost", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaConfigureCall", + ("hipConfigureCall", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaLaunch", ("hipLaunch", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED)), + ( + "cudaSetupArgument", + ("hipSetupArgument", CONV_EXEC, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaDriverGetVersion", ("hipDriverGetVersion", CONV_VERSION, API_RUNTIME)), + ( + "cudaRuntimeGetVersion", + ("hipRuntimeGetVersion", CONV_VERSION, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaOccupancyMaxPotentialBlockSize", + ("hipOccupancyMaxPotentialBlockSize", CONV_OCCUPANCY, API_RUNTIME), + ), + ( + "cudaOccupancyMaxPotentialBlockSizeWithFlags", + ( + "hipOccupancyMaxPotentialBlockSizeWithFlags", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaOccupancyMaxActiveBlocksPerMultiprocessor", + ( + "hipOccupancyMaxActiveBlocksPerMultiprocessor", + CONV_OCCUPANCY, + API_RUNTIME, + ), + ), + ( + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + ( + "hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaOccupancyMaxPotentialBlockSizeVariableSMem", + ( + "hipOccupancyMaxPotentialBlockSizeVariableSMem", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags", + ( + "hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags", + CONV_OCCUPANCY, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaDeviceCanAccessPeer", ("hipDeviceCanAccessPeer", CONV_PEER, API_RUNTIME)), + ( + "cudaDeviceDisablePeerAccess", + ("hipDeviceDisablePeerAccess", CONV_PEER, API_RUNTIME), + ), + ( + "cudaDeviceEnablePeerAccess", + ("hipDeviceEnablePeerAccess", CONV_PEER, API_RUNTIME), + ), + ("cudaMemcpyPeerAsync", ("hipMemcpyPeerAsync", CONV_MEM, API_RUNTIME)), + ("cudaMemcpyPeer", ("hipMemcpyPeer", CONV_MEM, API_RUNTIME)), + ( + "cudaIpcMemLazyEnablePeerAccess", + ("hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaDeviceSetSharedMemConfig", + ("hipDeviceSetSharedMemConfig", CONV_DEVICE, API_RUNTIME), + ), + ( + "cudaDeviceGetSharedMemConfig", + ("hipDeviceGetSharedMemConfig", CONV_DEVICE, API_RUNTIME), + ), + ( + "cudaSharedMemBankSizeDefault", + ("hipSharedMemBankSizeDefault", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaSharedMemBankSizeFourByte", + ("hipSharedMemBankSizeFourByte", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaSharedMemBankSizeEightByte", + ("hipSharedMemBankSizeEightByte", CONV_TYPE, API_RUNTIME), + ), + ( + "cudaLimitStackSize", + ("hipLimitStackSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaLimitPrintfFifoSize", + ("hipLimitPrintfFifoSize", CONV_TYPE, API_RUNTIME, 
HIP_UNSUPPORTED), + ), + ("cudaLimitMallocHeapSize", ("hipLimitMallocHeapSize", CONV_TYPE, API_RUNTIME)), + ( + "cudaLimitDevRuntimeSyncDepth", + ("hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaLimitDevRuntimePendingLaunchCount", + ( + "hipLimitDevRuntimePendingLaunchCount", + CONV_TYPE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaDeviceGetLimit", ("hipDeviceGetLimit", CONV_DEVICE, API_RUNTIME)), + ( + "cudaProfilerInitialize", + ("hipProfilerInitialize", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaProfilerStart", ("hipProfilerStart", CONV_OTHER, API_RUNTIME)), + ("cudaProfilerStop", ("hipProfilerStop", CONV_OTHER, API_RUNTIME)), + ( + "cudaKeyValuePair", + ("hipKeyValuePair", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED), + ), + ("cudaCSV", ("hipCSV", CONV_OTHER, API_RUNTIME, HIP_UNSUPPORTED)), + ("cudaReadModeElementType", ("hipReadModeElementType", CONV_TEX, API_RUNTIME)), + ( + "cudaReadModeNormalizedFloat", + ("hipReadModeNormalizedFloat", CONV_TEX, API_RUNTIME), + ), + ("cudaFilterModePoint", ("hipFilterModePoint", CONV_TEX, API_RUNTIME)), + ("cudaFilterModeLinear", ("hipFilterModeLinear", CONV_TEX, API_RUNTIME)), + ("cudaBindTexture", ("hipBindTexture", CONV_TEX, API_RUNTIME)), + ("cudaUnbindTexture", ("hipUnbindTexture", CONV_TEX, API_RUNTIME)), + ("cudaBindTexture2D", ("hipBindTexture2D", CONV_TEX, API_RUNTIME)), + ("cudaBindTextureToArray", ("hipBindTextureToArray", CONV_TEX, API_RUNTIME)), + ( + "cudaBindTextureToMipmappedArray", + ("hipBindTextureToMipmappedArray", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureAlignmentOffset", + ("hipGetTextureAlignmentOffset", CONV_TEX, API_RUNTIME), + ), + ("cudaGetTextureReference", ("hipGetTextureReference", CONV_TEX, API_RUNTIME)), + ( + "cudaChannelFormatKindSigned", + ("hipChannelFormatKindSigned", CONV_TEX, API_RUNTIME), + ), + ( + "cudaChannelFormatKindUnsigned", + ("hipChannelFormatKindUnsigned", CONV_TEX, API_RUNTIME), + ), + ( + "cudaChannelFormatKindFloat", + ("hipChannelFormatKindFloat", CONV_TEX, API_RUNTIME), + ), + ( + "cudaChannelFormatKindNone", + ("hipChannelFormatKindNone", CONV_TEX, API_RUNTIME), + ), + ("cudaCreateChannelDesc", ("hipCreateChannelDesc", CONV_TEX, API_RUNTIME)), + ("cudaGetChannelDesc", ("hipGetChannelDesc", CONV_TEX, API_RUNTIME)), + ("cudaResourceTypeArray", ("hipResourceTypeArray", CONV_TEX, API_RUNTIME)), + ( + "cudaResourceTypeMipmappedArray", + ("hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME), + ), + ("cudaResourceTypeLinear", ("hipResourceTypeLinear", CONV_TEX, API_RUNTIME)), + ("cudaResourceTypePitch2D", ("hipResourceTypePitch2D", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatNone", ("hipResViewFormatNone", CONV_TEX, API_RUNTIME)), + ( + "cudaResViewFormatUnsignedChar1", + ("hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedChar2", + ("hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedChar4", + ("hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedChar1", + ("hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedChar2", + ("hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedChar4", + ("hipResViewFormatSignedChar4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedShort1", + ("hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedShort2", + ("hipResViewFormatUnsignedShort2", CONV_TEX, 
API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedShort4", + ("hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedShort1", + ("hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedShort2", + ("hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedShort4", + ("hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedInt1", + ("hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedInt2", + ("hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedInt4", + ("hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedInt1", + ("hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedInt2", + ("hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedInt4", + ("hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME), + ), + ("cudaResViewFormatHalf1", ("hipResViewFormatHalf1", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatHalf2", ("hipResViewFormatHalf2", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatHalf4", ("hipResViewFormatHalf4", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatFloat1", ("hipResViewFormatFloat1", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatFloat2", ("hipResViewFormatFloat2", CONV_TEX, API_RUNTIME)), + ("cudaResViewFormatFloat4", ("hipResViewFormatFloat4", CONV_TEX, API_RUNTIME)), + ( + "cudaResViewFormatUnsignedBlockCompressed1", + ("hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed2", + ("hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed3", + ("hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed4", + ("hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedBlockCompressed4", + ("hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed5", + ("hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedBlockCompressed5", + ("hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed6H", + ("hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatSignedBlockCompressed6H", + ("hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME), + ), + ( + "cudaResViewFormatUnsignedBlockCompressed7", + ("hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME), + ), + ("cudaAddressModeWrap", ("hipAddressModeWrap", CONV_TEX, API_RUNTIME)), + ("cudaAddressModeClamp", ("hipAddressModeClamp", CONV_TEX, API_RUNTIME)), + ("cudaAddressModeMirror", ("hipAddressModeMirror", CONV_TEX, API_RUNTIME)), + ("cudaAddressModeBorder", ("hipAddressModeBorder", CONV_TEX, API_RUNTIME)), + ("cudaCreateTextureObject", ("hipCreateTextureObject", CONV_TEX, API_RUNTIME)), + ( + "cudaDestroyTextureObject", + ("hipDestroyTextureObject", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureObjectResourceDesc", + ("hipGetTextureObjectResourceDesc", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureObjectResourceViewDesc", + ("hipGetTextureObjectResourceViewDesc", CONV_TEX, API_RUNTIME), + ), + ( + "cudaGetTextureObjectTextureDesc", + 
("hipGetTextureObjectTextureDesc", CONV_TEX, API_RUNTIME), + ), + ( + "cudaBindSurfaceToArray", + ("hipBindSurfaceToArray", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSurfaceReference", + ("hipGetSurfaceReference", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaBoundaryModeZero", + ("hipBoundaryModeZero", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaBoundaryModeClamp", + ("hipBoundaryModeClamp", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaBoundaryModeTrap", + ("hipBoundaryModeTrap", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFormatModeForced", + ("hipFormatModeForced", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaFormatModeAuto", + ("hipFormatModeAuto", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaCreateSurfaceObject", + ("hipCreateSurfaceObject", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaDestroySurfaceObject", + ("hipDestroySurfaceObject", CONV_SURFACE, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGetSurfaceObjectResourceDesc", + ( + "hipGetSurfaceObjectResourceDesc", + CONV_SURFACE, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cudaIpcCloseMemHandle", ("hipIpcCloseMemHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcGetEventHandle", ("hipIpcGetEventHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcGetMemHandle", ("hipIpcGetMemHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcOpenEventHandle", ("hipIpcOpenEventHandle", CONV_DEVICE, API_RUNTIME)), + ("cudaIpcOpenMemHandle", ("hipIpcOpenMemHandle", CONV_DEVICE, API_RUNTIME)), + ( + "cudaGLGetDevices", + ("hipGLGetDevices", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterBuffer", + ("hipGraphicsGLRegisterBuffer", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterImage", + ("hipGraphicsGLRegisterImage", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaWGLGetDevice", + ("hipWGLGetDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsMapResources", + ("hipGraphicsMapResources", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsResourceGetMappedMipmappedArray", + ( + "hipGraphicsResourceGetMappedMipmappedArray", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsResourceGetMappedPointer", + ( + "hipGraphicsResourceGetMappedPointer", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsResourceSetMapFlags", + ( + "hipGraphicsResourceSetMapFlags", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsSubResourceGetMappedArray", + ( + "hipGraphicsSubResourceGetMappedArray", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsUnmapResources", + ("hipGraphicsUnmapResources", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsUnregisterResource", + ( + "hipGraphicsUnregisterResource", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFacePositiveX", + ( + "hipGraphicsCubeFacePositiveX", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFaceNegativeX", + ( + "hipGraphicsCubeFaceNegativeX", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFacePositiveY", + ( + "hipGraphicsCubeFacePositiveY", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFaceNegativeY", + ( + "hipGraphicsCubeFaceNegativeY", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + 
"cudaGraphicsCubeFacePositiveZ", + ( + "hipGraphicsCubeFacePositiveZ", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsCubeFaceNegativeZ", + ( + "hipGraphicsCubeFaceNegativeZ", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsMapFlagsNone", + ("hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsMapFlagsReadOnly", + ( + "hipGraphicsMapFlagsReadOnly", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsMapFlagsWriteDiscard", + ( + "hipGraphicsMapFlagsWriteDiscard", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsNone", + ( + "hipGraphicsRegisterFlagsNone", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsReadOnly", + ( + "hipGraphicsRegisterFlagsReadOnly", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsWriteDiscard", + ( + "hipGraphicsRegisterFlagsWriteDiscard", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsSurfaceLoadStore", + ( + "hipGraphicsRegisterFlagsSurfaceLoadStore", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsRegisterFlagsTextureGather", + ( + "hipGraphicsRegisterFlagsTextureGather", + CONV_GRAPHICS, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGLDeviceListAll", + ("HIP_GL_DEVICE_LIST_ALL", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLDeviceListCurrentFrame", + ("HIP_GL_DEVICE_LIST_CURRENT_FRAME", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLDeviceListNextFrame", + ("HIP_GL_DEVICE_LIST_NEXT_FRAME", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLGetDevices", + ("hipGLGetDevices", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterBuffer", + ("hipGraphicsGLRegisterBuffer", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsGLRegisterImage", + ("hipGraphicsGLRegisterImage", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaWGLGetDevice", + ("hipWGLGetDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLMapFlagsNone", + ("HIP_GL_MAP_RESOURCE_FLAGS_NONE", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLMapFlagsReadOnly", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_READ_ONLY", + CONV_GL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGLMapFlagsWriteDiscard", + ( + "HIP_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD", + CONV_GL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGLMapBufferObject", + ("hipGLMapBufferObject__", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLMapBufferObjectAsync", + ("hipGLMapBufferObjectAsync__", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLRegisterBufferObject", + ("hipGLRegisterBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLSetBufferObjectMapFlags", + ("hipGLSetBufferObjectMapFlags", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLSetGLDevice", + ("hipGLSetGLDevice", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLUnmapBufferObject", + ("hipGLUnmapBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLUnmapBufferObjectAsync", + ("hipGLUnmapBufferObjectAsync", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGLUnregisterBufferObject", + ("hipGLUnregisterBufferObject", CONV_GL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9DeviceListAll", + ("HIP_D3D9_DEVICE_LIST_ALL", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), 
+ ), + ( + "cudaD3D9DeviceListCurrentFrame", + ( + "HIP_D3D9_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9DeviceListNextFrame", + ( + "HIP_D3D9_DEVICE_LIST_NEXT_FRAME", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9GetDevice", + ("hipD3D9GetDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9GetDevices", + ("hipD3D9GetDevices", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9GetDirect3DDevice", + ("hipD3D9GetDirect3DDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9SetDirect3DDevice", + ("hipD3D9SetDirect3DDevice", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D9RegisterResource", + ( + "hipGraphicsD3D9RegisterResource", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9MapFlags", + ("hipD3D9MapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9MapFlagsNone", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_NONE", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9MapFlagsReadOnly", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9MapFlagsWriteDiscard", + ( + "HIP_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9RegisterFlagsNone", + ("HIP_D3D9_REGISTER_FLAGS_NONE", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9RegisterFlagsArray", + ("HIP_D3D9_REGISTER_FLAGS_ARRAY", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9MapResources", + ("hipD3D9MapResources", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9RegisterResource", + ("hipD3D9RegisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetMappedArray", + ("hipD3D9ResourceGetMappedArray", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetMappedPitch", + ("hipD3D9ResourceGetMappedPitch", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetMappedPointer", + ( + "hipD3D9ResourceGetMappedPointer", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9ResourceGetMappedSize", + ("hipD3D9ResourceGetMappedSize", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9ResourceGetSurfaceDimensions", + ( + "hipD3D9ResourceGetSurfaceDimensions", + CONV_D3D9, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D9ResourceSetMapFlags", + ("hipD3D9ResourceSetMapFlags", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9UnmapResources", + ("hipD3D9UnmapResources", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D9UnregisterResource", + ("hipD3D9UnregisterResource", CONV_D3D9, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10DeviceListAll", + ("HIP_D3D10_DEVICE_LIST_ALL", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10DeviceListCurrentFrame", + ( + "HIP_D3D10_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10DeviceListNextFrame", + ( + "HIP_D3D10_DEVICE_LIST_NEXT_FRAME", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10GetDevice", + ("hipD3D10GetDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10GetDevices", + ("hipD3D10GetDevices", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D10RegisterResource", + ( + "hipGraphicsD3D10RegisterResource", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + 
"cudaD3D10MapFlagsNone", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_NONE", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10MapFlagsReadOnly", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_READONLY", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10MapFlagsWriteDiscard", + ( + "HIP_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10RegisterFlagsNone", + ("HIP_D3D10_REGISTER_FLAGS_NONE", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10RegisterFlagsArray", + ( + "HIP_D3D10_REGISTER_FLAGS_ARRAY", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10GetDirect3DDevice", + ("hipD3D10GetDirect3DDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10MapResources", + ("hipD3D10MapResources", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10RegisterResource", + ("hipD3D10RegisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10ResourceGetMappedArray", + ( + "hipD3D10ResourceGetMappedArray", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceGetMappedPitch", + ( + "hipD3D10ResourceGetMappedPitch", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceGetMappedPointer", + ( + "hipD3D10ResourceGetMappedPointer", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceGetMappedSize", + ("hipD3D10ResourceGetMappedSize", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10ResourceGetSurfaceDimensions", + ( + "hipD3D10ResourceGetSurfaceDimensions", + CONV_D3D10, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D10ResourceSetMapFlags", + ("hipD3D10ResourceSetMapFlags", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10SetDirect3DDevice", + ("hipD3D10SetDirect3DDevice", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10UnmapResources", + ("hipD3D10UnmapResources", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D10UnregisterResource", + ("hipD3D10UnregisterResource", CONV_D3D10, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11DeviceListAll", + ("HIP_D3D11_DEVICE_LIST_ALL", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11DeviceListCurrentFrame", + ( + "HIP_D3D11_DEVICE_LIST_CURRENT_FRAME", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D11DeviceListNextFrame", + ( + "HIP_D3D11_DEVICE_LIST_NEXT_FRAME", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D11GetDevice", + ("hipD3D11GetDevice", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11GetDevices", + ("hipD3D11GetDevices", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D11RegisterResource", + ( + "hipGraphicsD3D11RegisterResource", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaD3D11GetDevice", + ("hipD3D11GetDevice", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaD3D11GetDevices", + ("hipD3D11GetDevices", CONV_D3D11, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsD3D11RegisterResource", + ( + "hipGraphicsD3D11RegisterResource", + CONV_D3D11, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsVDPAURegisterOutputSurface", + ( + "hipGraphicsVDPAURegisterOutputSurface", + CONV_VDPAU, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaGraphicsVDPAURegisterVideoSurface", + ( + "hipGraphicsVDPAURegisterVideoSurface", + CONV_VDPAU, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( 
+ "cudaVDPAUGetDevice", + ("hipVDPAUGetDevice", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaVDPAUSetVDPAUDevice", + ("hipVDPAUSetDevice", CONV_VDPAU, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamConsumerAcquireFrame", + ( + "hipEGLStreamConsumerAcquireFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamConsumerConnect", + ("hipEGLStreamConsumerConnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamConsumerConnectWithFlags", + ( + "hipEGLStreamConsumerConnectWithFlags", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamConsumerReleaseFrame", + ( + "hipEGLStreamConsumerReleaseFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamProducerConnect", + ("hipEGLStreamProducerConnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamProducerDisconnect", + ("hipEGLStreamProducerDisconnect", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaEGLStreamProducerPresentFrame", + ( + "hipEGLStreamProducerPresentFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ( + "cudaEGLStreamProducerReturnFrame", + ("hipEGLStreamProducerReturnFrame", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsEGLRegisterImage", + ("hipGraphicsEGLRegisterImage", CONV_EGL, API_RUNTIME, HIP_UNSUPPORTED), + ), + ( + "cudaGraphicsResourceGetMappedEglFrame", + ( + "hipGraphicsResourceGetMappedEglFrame", + CONV_EGL, + API_RUNTIME, + HIP_UNSUPPORTED, + ), + ), + ("cublasInit", ("rocblas_init", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasShutdown", + ("rocblas_shutdown", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetVersion", + ("rocblas_get_version", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetError", + ("rocblas_get_error", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasAlloc", ("rocblas_alloc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasFree", ("rocblas_free", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasSetKernelStream", + ("rocblas_set_kernel_stream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetAtomicsMode", + ("rocblas_get_atomics_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSetAtomicsMode", + ("rocblas_set_atomics_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetMathMode", + ("rocblas_get_math_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSetMathMode", + ("rocblas_set_math_mode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("CUBLAS_OP_N", ("rocblas_operation_none", CONV_NUMERIC_LITERAL, API_BLAS)), + ( + "CUBLAS_OP_T", + ("rocblas_operation_transpose", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_OP_C", + ("rocblas_operation_conjugate_transpose", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_SUCCESS", + ("rocblas_status_success", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_NOT_INITIALIZED", + ("rocblas_status_invalid_handle", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_ALLOC_FAILED", + ("rocblas_status_memory_error", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_INVALID_VALUE", + ("rocblas_status_invalid_pointer", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_MAPPING_ERROR", + ("rocblas_status_internal_error", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_EXECUTION_FAILED", + ("rocblas_status_internal_error", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + 
"CUBLAS_STATUS_INTERNAL_ERROR", + ("rocblas_status_internal_error", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_NOT_SUPPORTED", + ("rocblas_status_not_implemented", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_STATUS_ARCH_MISMATCH", + ("rocblas_status_not_implemented", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_FILL_MODE_LOWER", + ("rocblas_fill_lower", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_FILL_MODE_UPPER", + ("rocblas_fill_upper", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_DIAG_NON_UNIT", + ("rocblas_diagonal_non_unit", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ("CUBLAS_DIAG_UNIT", ("rocblas_diagonal_unit", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUBLAS_SIDE_LEFT", ("rocblas_side_left", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUBLAS_SIDE_RIGHT", ("rocblas_side_right", CONV_NUMERIC_LITERAL, API_BLAS)), + ( + "CUBLAS_POINTER_MODE_HOST", + ("rocblas_pointer_mode_host", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_POINTER_MODE_DEVICE", + ("rocblas_pointer_mode_device", CONV_NUMERIC_LITERAL, API_BLAS), + ), + ( + "CUBLAS_ATOMICS_NOT_ALLOWED", + ( + "rocblas_atomics_not_allowed", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_ATOMICS_ALLOWED", + ( + "rocblas_atomics_allowed", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_DATA_FLOAT", + ( + "rocblas_precision_float", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_DATA_DOUBLE", + ( + "rocblas_precision_double", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ( + "CUBLAS_DATA_HALF", + ("rocblas_precision_half", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "CUBLAS_DATA_INT8", + ("rocblas_precision_int8", CONV_NUMERIC_LITERAL, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCreate", ("rocblas_create_handle", CONV_MATH_FUNC, API_BLAS)), + ("cublasDestroy", ("rocblas_destroy_handle", CONV_MATH_FUNC, API_BLAS)), + ("cublasSetVector", ("rocblas_set_vector", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetVector", ("rocblas_get_vector", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSetVectorAsync", + ("rocblas_set_vector_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasGetVectorAsync", + ("rocblas_get_vector_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSetMatrix", ("rocblas_set_matrix", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetMatrix", ("rocblas_get_matrix", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasGetMatrixAsync", + ("rocblas_get_matrix_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSetMatrixAsync", + ("rocblas_set_matrix_async", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasXerbla", ("rocblas_xerbla", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSnrm2", ("rocblas_snrm2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDnrm2", ("rocblas_dnrm2", CONV_MATH_FUNC, API_BLAS)), + ("cublasScnrm2", ("rocblas_scnrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDznrm2", ("rocblas_dznrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasNrm2Ex", + ("rocblas_nrm2_ex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSdot", ("rocblas_sdot", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSdotBatched", + ("rocblas_sdot_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDdot", ("rocblas_ddot", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDdotBatched", + ("rocblas_ddot_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCdotu", ("rocblas_cdotu", CONV_MATH_FUNC, 
API_BLAS, HIP_UNSUPPORTED)), + ("cublasCdotc", ("rocblas_cdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdotu", ("rocblas_zdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdotc", ("rocblas_zdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSscal", ("rocblas_sscal", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSscalBatched", + ("rocblas_sscal_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDscal", ("rocblas_dscal", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDscalBatched", + ("rocblas_dscal_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCscal", ("rocblas_cscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsscal", ("rocblas_csscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZscal", ("rocblas_zscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdscal", ("rocblas_zdscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSaxpy", ("rocblas_saxpy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSaxpyBatched", + ("rocblas_saxpy_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDaxpy", ("rocblas_daxpy", CONV_MATH_FUNC, API_BLAS)), + ("cublasCaxpy", ("rocblas_caxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZaxpy", ("rocblas_zaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasScopy", ("rocblas_scopy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasScopyBatched", + ("rocblas_scopy_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDcopy", ("rocblas_dcopy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDcopyBatched", + ("rocblas_dcopy_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasCcopy", ("rocblas_ccopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZcopy", ("rocblas_zcopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSswap", ("rocblas_sswap", CONV_MATH_FUNC, API_BLAS)), + ("cublasDswap", ("rocblas_dswap", CONV_MATH_FUNC, API_BLAS)), + ("cublasCswap", ("rocblas_cswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZswap", ("rocblas_zswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIsamax", ("rocblas_isamax", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamax", ("rocblas_idamax", CONV_MATH_FUNC, API_BLAS)), + ("cublasIcamax", ("rocblas_icamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIzamax", ("rocblas_izamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIsamin", ("rocblas_isamin", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamin", ("rocblas_idamin", CONV_MATH_FUNC, API_BLAS)), + ("cublasIcamin", ("rocblas_icamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasIzamin", ("rocblas_izamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSasum", ("rocblas_sasum", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSasumBatched", + ("rocblas_sasum_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDasum", ("rocblas_dasum", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasDasumBatched", + ("rocblas_dasum_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasScasum", ("rocblas_scasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDzasum", ("rocblas_dzasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrot", ("rocblas_srot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrot", ("rocblas_drot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCrot", ("rocblas_crot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsrot", ("rocblas_csrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + 
("cublasZrot", ("rocblas_zrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdrot", ("rocblas_zdrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrotg", ("rocblas_srotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrotg", ("rocblas_drotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCrotg", ("rocblas_crotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZrotg", ("rocblas_zrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrotm", ("rocblas_srotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrotm", ("rocblas_drotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSrotmg", ("rocblas_srotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrotmg", ("rocblas_drotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSgemv", ("rocblas_sgemv", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasSgemvBatched", + ("rocblas_sgemv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDgemv", ("rocblas_dgemv", CONV_MATH_FUNC, API_BLAS)), + ("cublasCgemv", ("rocblas_cgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgemv", ("rocblas_zgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSgbmv", ("rocblas_sgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDgbmv", ("rocblas_dgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCgbmv", ("rocblas_cgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgbmv", ("rocblas_zgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrmv", ("rocblas_strmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrmv", ("rocblas_dtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrmv", ("rocblas_ctrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrmv", ("rocblas_ztrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStbmv", ("rocblas_stbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtbmv", ("rocblas_dtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtbmv", ("rocblas_ctbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtbmv", ("rocblas_ztbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStpmv", ("rocblas_stpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtpmv", ("rocblas_dtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtpmv", ("rocblas_ctpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtpmv", ("rocblas_ztpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrsv", ("rocblas_strsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrsv", ("rocblas_dtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrsv", ("rocblas_ctrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrsv", ("rocblas_ztrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStpsv", ("rocblas_stpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtpsv", ("rocblas_dtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtpsv", ("rocblas_ctpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtpsv", ("rocblas_ztpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStbsv", ("rocblas_stbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtbsv", ("rocblas_dtbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtbsv", ("rocblas_ctbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtbsv", ("rocblas_ztbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsymv", ("rocblas_ssymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), 
+ ("cublasDsymv", ("rocblas_dsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCsymv", ("rocblas_csymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZsymv", ("rocblas_zsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChemv", ("rocblas_chemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhemv", ("rocblas_zhemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsbmv", ("rocblas_ssbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsbmv", ("rocblas_dsbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChbmv", ("rocblas_chbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhbmv", ("rocblas_zhbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSspmv", ("rocblas_sspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspmv", ("rocblas_dspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpmv", ("rocblas_chpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpmv", ("rocblas_zhpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSger", ("rocblas_sger", CONV_MATH_FUNC, API_BLAS)), + ("cublasDger", ("rocblas_dger", CONV_MATH_FUNC, API_BLAS)), + ("cublasCgeru", ("rocblas_cgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCgerc", ("rocblas_cgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgeru", ("rocblas_zgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZgerc", ("rocblas_zgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsyr", ("rocblas_ssyr", CONV_MATH_FUNC, API_BLAS)), + ("cublasDsyr", ("rocblas_dsyr", CONV_MATH_FUNC, API_BLAS)), + ("cublasCher", ("rocblas_cher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZher", ("rocblas_zher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSspr", ("rocblas_sspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspr", ("rocblas_dspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpr", ("rocblas_chpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpr", ("rocblas_zhpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSsyr2", ("rocblas_ssyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDsyr2", ("rocblas_dsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCher2", ("rocblas_cher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZher2", ("rocblas_zher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSspr2", ("rocblas_sspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspr2", ("rocblas_dspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpr2", ("rocblas_chpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpr2", ("rocblas_zhpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasSgemmBatched", + ("rocblas_sgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgemmBatched", + ("rocblas_dgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasHgemmBatched", + ("rocblas_hgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgemmStridedBatched", + ("rocblas_sgemm_strided_batched", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasDgemmStridedBatched", + ("rocblas_dgemm_strided_batched", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasHgemmStridedBatched", + ("rocblas_hgemm_strided_batched", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasCgemmBatched", + ("rocblas_cgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemm3mBatched", + ("rocblas_cgemm_3m_batched", CONV_MATH_FUNC, 
API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgemmBatched",
+            ("rocblas_zgemm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCgemmStridedBatched",
+            (
+                "rocblas_cgemm_strided_batched",
+                CONV_MATH_FUNC,
+                API_BLAS,
+                HIP_UNSUPPORTED,
+            ),
+        ),
+        (
+            "cublasCgemm3mStridedBatched",
+            (
+                "rocblas_cgemm_3m_strided_batched",
+                CONV_MATH_FUNC,
+                API_BLAS,
+                HIP_UNSUPPORTED,
+            ),
+        ),
+        (
+            "cublasZgemmStridedBatched",
+            (
+                "rocblas_zgemm_strided_batched",
+                CONV_MATH_FUNC,
+                API_BLAS,
+                HIP_UNSUPPORTED,
+            ),
+        ),
+        (
+            "cublasHgemmStridedBatched",
+            (
+                "rocblas_hgemm_strided_batched",
+                CONV_MATH_FUNC,
+                API_BLAS,
+                HIP_UNSUPPORTED,
+            ),
+        ),
+        ("cublasSgemm", ("rocblas_sgemm", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasDgemm", ("rocblas_dgemm", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasCgemm", ("rocblas_cgemm", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasZgemm", ("rocblas_zgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasHgemm", ("rocblas_hgemm", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasSsyrk", ("rocblas_ssyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasDsyrk", ("rocblas_dsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCsyrk", ("rocblas_csyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZsyrk", ("rocblas_zsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCherk", ("rocblas_cherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZherk", ("rocblas_zherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasSsyr2k", ("rocblas_ssyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasDsyr2k", ("rocblas_dsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCsyr2k", ("rocblas_csyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZsyr2k", ("rocblas_zsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasSsyrkx", ("rocblas_ssyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasDsyrkx", ("rocblas_dsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCsyrkx", ("rocblas_csyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZsyrkx", ("rocblas_zsyrkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCher2k", ("rocblas_cher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZher2k", ("rocblas_zher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCherkx", ("rocblas_cherkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZherkx", ("rocblas_zherkx", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasSsymm", ("rocblas_ssymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasDsymm", ("rocblas_dsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCsymm", ("rocblas_csymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZsymm", ("rocblas_zsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasChemm", ("rocblas_chemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZhemm", ("rocblas_zhemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasStrsm", ("rocblas_strsm", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasDtrsm", ("rocblas_dtrsm", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasCtrsm", ("rocblas_ctrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZtrsm", ("rocblas_ztrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        (
+            "cublasStrsmBatched",
+            ("rocblas_strsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDtrsmBatched",
+            ("rocblas_dtrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCtrsmBatched",
+            ("rocblas_ctrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZtrsmBatched",
+            ("rocblas_ztrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        ("cublasStrmm", ("rocblas_strmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasDtrmm", ("rocblas_dtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCtrmm", ("rocblas_ctrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZtrmm", ("rocblas_ztrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasSgeam", ("rocblas_sgeam", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasDgeam", ("rocblas_dgeam", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasCgeam", ("rocblas_cgeam", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasZgeam", ("rocblas_zgeam", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        (
+            "cublasSgetrfBatched",
+            ("rocblas_sgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDgetrfBatched",
+            ("rocblas_dgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCgetrfBatched",
+            ("rocblas_cgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgetrfBatched",
+            ("rocblas_zgetrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSgetriBatched",
+            ("rocblas_sgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDgetriBatched",
+            ("rocblas_dgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCgetriBatched",
+            ("rocblas_cgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgetriBatched",
+            ("rocblas_zgetri_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSgetrsBatched",
+            ("rocblas_sgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDgetrsBatched",
+            ("rocblas_dgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCgetrsBatched",
+            ("rocblas_cgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgetrsBatched",
+            ("rocblas_zgetrs_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasStrsmBatched",
+            ("rocblas_strsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDtrsmBatched",
+            ("rocblas_dtrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCtrsmBatched",
+            ("rocblas_ctrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZtrsmBatched",
+            ("rocblas_ztrsm_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSmatinvBatched",
+            ("rocblas_smatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDmatinvBatched",
+            ("rocblas_dmatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCmatinvBatched",
+            ("rocblas_cmatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZmatinvBatched",
+            ("rocblas_zmatinv_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSgeqrfBatched",
+            ("rocblas_sgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDgeqrfBatched",
+            ("rocblas_dgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCgeqrfBatched",
+            ("rocblas_cgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgeqrfBatched",
+            ("rocblas_zgeqrf_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSgelsBatched",
+            ("rocblas_sgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDgelsBatched",
+            ("rocblas_dgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
"cublasCgelsBatched", + ("rocblas_cgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgelsBatched", + ("rocblas_zgels_batched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSdgmm", ("rocblas_sdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDdgmm", ("rocblas_ddgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCdgmm", ("rocblas_cdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZdgmm", ("rocblas_zdgmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStpttr", ("rocblas_stpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtpttr", ("rocblas_dtpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtpttr", ("rocblas_ctpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtpttr", ("rocblas_ztpttr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasStrttp", ("rocblas_strttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDtrttp", ("rocblas_dtrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCtrttp", ("rocblas_ctrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZtrttp", ("rocblas_ztrttp", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCreate_v2", ("rocblas_create_handle", CONV_MATH_FUNC, API_BLAS)), + ("cublasDestroy_v2", ("rocblas_destroy_handle", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasGetVersion_v2", + ("rocblas_get_version", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSetStream", ("rocblas_set_stream", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetStream", ("rocblas_get_stream", CONV_MATH_FUNC, API_BLAS)), + ("cublasSetStream_v2", ("rocblas_set_stream", CONV_MATH_FUNC, API_BLAS)), + ("cublasGetStream_v2", ("rocblas_get_stream", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasGetPointerMode", + ("rocblas_get_pointer_mode", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasSetPointerMode", + ("rocblas_set_pointer_mode", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasGetPointerMode_v2", + ("rocblas_get_pointer_mode", CONV_MATH_FUNC, API_BLAS), + ), + ( + "cublasSetPointerMode_v2", + ("rocblas_set_pointer_mode", CONV_MATH_FUNC, API_BLAS), + ), + ("cublasSgemv_v2", ("rocblas_sgemv", CONV_MATH_FUNC, API_BLAS)), + ("cublasDgemv_v2", ("rocblas_dgemv", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCgemv_v2", + ("rocblas_cgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemv_v2", + ("rocblas_zgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSgbmv_v2", + ("rocblas_sgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDgbmv_v2", + ("rocblas_dgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgbmv_v2", + ("rocblas_cgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgbmv_v2", + ("rocblas_zgbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrmv_v2", + ("rocblas_strmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrmv_v2", + ("rocblas_dtrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrmv_v2", + ("rocblas_ctrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrmv_v2", + ("rocblas_ztrmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStbmv_v2", + ("rocblas_stbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtbmv_v2", + ("rocblas_dtbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtbmv_v2", + ("rocblas_ctbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtbmv_v2", + ("rocblas_ztbmv", CONV_MATH_FUNC, API_BLAS, 
HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasStpmv_v2",
+            ("rocblas_stpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDtpmv_v2",
+            ("rocblas_dtpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCtpmv_v2",
+            ("rocblas_ctpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZtpmv_v2",
+            ("rocblas_ztpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasStrsv_v2",
+            ("rocblas_strsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDtrsv_v2",
+            ("rocblas_dtrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCtrsv_v2",
+            ("rocblas_ctrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZtrsv_v2",
+            ("rocblas_ztrsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasStpsv_v2",
+            ("rocblas_stpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDtpsv_v2",
+            ("rocblas_dtpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCtpsv_v2",
+            ("rocblas_ctpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZtpsv_v2",
+            ("rocblas_ztpsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasStbsv_v2",
+            ("rocblas_stbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDtbsv_v2",
+            ("rocblas_dtbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCtbsv_v2",
+            ("rocblas_ctbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZtbsv_v2",
+            ("rocblas_ztbsv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSsymv_v2",
+            ("rocblas_ssymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDsymv_v2",
+            ("rocblas_dsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCsymv_v2",
+            ("rocblas_csymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZsymv_v2",
+            ("rocblas_zsymv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasChemv_v2",
+            ("rocblas_chemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZhemv_v2",
+            ("rocblas_zhemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSsbmv_v2",
+            ("rocblas_ssbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDsbmv_v2",
+            ("rocblas_dsbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasChbmv_v2",
+            ("rocblas_chbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZhbmv_v2",
+            ("rocblas_zhbmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasSspmv_v2",
+            ("rocblas_sspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasDspmv_v2",
+            ("rocblas_dspmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasChpmv_v2",
+            ("rocblas_chpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZhpmv_v2",
+            ("rocblas_zhpmv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        ("cublasSger_v2", ("rocblas_sger", CONV_MATH_FUNC, API_BLAS)),
+        ("cublasDger_v2", ("rocblas_dger", CONV_MATH_FUNC, API_BLAS)),
+        (
+            "cublasCgeru_v2",
+            ("rocblas_cgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasCgerc_v2",
+            ("rocblas_cgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgeru_v2",
+            ("rocblas_zgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        (
+            "cublasZgerc_v2",
+            ("rocblas_zgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED),
+        ),
+        ("cublasSsyr_v2", ("rocblas_ssyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasDsyr_v2", ("rocblas_dsyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)),
+        ("cublasCsyr_v2", ("rocblas_csyr", CONV_MATH_FUNC, API_BLAS,
HIP_UNSUPPORTED)), + ("cublasZsyr_v2", ("rocblas_zsyr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCher_v2", ("rocblas_cher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZher_v2", ("rocblas_zher", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSspr_v2", ("rocblas_sspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDspr_v2", ("rocblas_dspr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasChpr_v2", ("rocblas_chpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasZhpr_v2", ("rocblas_zhpr", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasSsyr2_v2", + ("rocblas_ssyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyr2_v2", + ("rocblas_dsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyr2_v2", + ("rocblas_csyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyr2_v2", + ("rocblas_zsyr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCher2_v2", + ("rocblas_cher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZher2_v2", + ("rocblas_zher2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSspr2_v2", + ("rocblas_sspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDspr2_v2", + ("rocblas_dspr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChpr2_v2", + ("rocblas_chpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhpr2_v2", + ("rocblas_zhpr2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSgemm_v2", ("rocblas_sgemm", CONV_MATH_FUNC, API_BLAS)), + ("cublasDgemm_v2", ("rocblas_dgemm", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCgemm_v2", + ("rocblas_cgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemm3m", + ("rocblas_cgemm_3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCgemm3mEx", + ("rocblas_cgemm_3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemm_v2", + ("rocblas_zgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZgemm3m", + ("rocblas_zgemm_3m", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + # NB: The function rocblas_sgemmex doesn't actually exist in + # rocblas, as of 2018-12-05 + ( + "cublasSgemmEx", + ("rocblas_sgemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasGemmEx", ("rocblas_gemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasCgemmEx", + ("rocblas_cgemmex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasUint8gemmBias", + ("rocblas_uint8gemmbias", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsyrk_v2", + ("rocblas_ssyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyrk_v2", + ("rocblas_dsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyrk_v2", + ("rocblas_csyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyrk_v2", + ("rocblas_zsyrk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyrkEx", + ("rocblas_csyrkex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyrk3mEx", + ("rocblas_csyrk3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCherk_v2", + ("rocblas_cherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCherkEx", + ("rocblas_cherkex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCherk3mEx", + ("rocblas_cherk3mex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZherk_v2", + ("rocblas_zherk", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + 
"cublasSsyr2k_v2", + ("rocblas_ssyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsyr2k_v2", + ("rocblas_dsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsyr2k_v2", + ("rocblas_csyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsyr2k_v2", + ("rocblas_zsyr2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCher2k_v2", + ("rocblas_cher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZher2k_v2", + ("rocblas_zher2k", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSsymm_v2", + ("rocblas_ssymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDsymm_v2", + ("rocblas_dsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsymm_v2", + ("rocblas_csymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZsymm_v2", + ("rocblas_zsymm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasChemm_v2", + ("rocblas_chemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZhemm_v2", + ("rocblas_zhemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrsm_v2", + ("rocblas_strsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrsm_v2", + ("rocblas_dtrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrsm_v2", + ("rocblas_ctrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrsm_v2", + ("rocblas_ztrsm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasStrmm_v2", + ("rocblas_strmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDtrmm_v2", + ("rocblas_dtrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCtrmm_v2", + ("rocblas_ctrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZtrmm_v2", + ("rocblas_ztrmm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSnrm2_v2", ("rocblas_snrm2", CONV_MATH_FUNC, API_BLAS)), + ("cublasDnrm2_v2", ("rocblas_dnrm2", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasScnrm2_v2", + ("rocblas_scnrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDznrm2_v2", + ("rocblas_dznrm2", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasDotEx", ("rocblas_dotex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDotcEx", ("rocblas_dotcex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSdot_v2", ("rocblas_sdot", CONV_MATH_FUNC, API_BLAS)), + ("cublasDdot_v2", ("rocblas_ddot", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCdotu_v2", + ("rocblas_cdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCdotc_v2", + ("rocblas_cdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdotu_v2", + ("rocblas_zdotu", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdotc_v2", + ("rocblas_zdotc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasScalEx", ("rocblas_scalex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSscal_v2", ("rocblas_sscal", CONV_MATH_FUNC, API_BLAS)), + ("cublasDscal_v2", ("rocblas_dscal", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCscal_v2", + ("rocblas_cscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCsscal_v2", + ("rocblas_csscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZscal_v2", + ("rocblas_zcsal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZdscal_v2", + ("rocblas_zdscal", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasAxpyEx", ("rocblas_axpyex", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasSaxpy_v2", 
("rocblas_saxpy", CONV_MATH_FUNC, API_BLAS)), + ("cublasDaxpy_v2", ("rocblas_daxpy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCaxpy_v2", + ("rocblas_caxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZaxpy_v2", + ("rocblas_zaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasScopy_v2", ("rocblas_scopy", CONV_MATH_FUNC, API_BLAS)), + ("cublasDcopy_v2", ("rocblas_dcopy", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCcopy_v2", + ("rocblas_ccopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZcopy_v2", + ("rocblas_zcopy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSswap_v2", ("rocblas_sswap", CONV_MATH_FUNC, API_BLAS)), + ("cublasDswap_v2", ("rocblas_dswap", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasCswap_v2", + ("rocblas_cswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZswap_v2", + ("rocblas_zswap", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasIsamax_v2", ("rocblas_isamax", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamax_v2", ("rocblas_idamax", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasIcamax_v2", + ("rocblas_icamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasIzamax_v2", + ("rocblas_izamax", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasIsamin_v2", ("rocblas_isamin", CONV_MATH_FUNC, API_BLAS)), + ("cublasIdamin_v2", ("rocblas_idamin", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasIcamin_v2", + ("rocblas_icamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasIzamin_v2", + ("rocblas_izamin", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSasum_v2", ("rocblas_sasum", CONV_MATH_FUNC, API_BLAS)), + ("cublasDasum_v2", ("rocblas_dasum", CONV_MATH_FUNC, API_BLAS)), + ( + "cublasScasum_v2", + ("rocblas_scasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDzasum_v2", + ("rocblas_dzasum", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasSrot_v2", ("rocblas_srot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasDrot_v2", ("rocblas_drot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ("cublasCrot_v2", ("rocblas_crot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasCsrot_v2", + ("rocblas_csrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ("cublasZrot_v2", ("rocblas_zrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED)), + ( + "cublasZdrot_v2", + ("rocblas_zdrot", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSrotg_v2", + ("rocblas_srotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrotg_v2", + ("rocblas_drotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasCrotg_v2", + ("rocblas_crotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasZrotg_v2", + ("rocblas_zrotg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSrotm_v2", + ("rocblas_srotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrotm_v2", + ("rocblas_drotm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasSrotmg_v2", + ("rocblas_srotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "cublasDrotmg_v2", + ("rocblas_drotmg", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED), + ), + ( + "CURAND_STATUS_SUCCESS", + ("HIPRAND_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_VERSION_MISMATCH", + ("HIPRAND_STATUS_VERSION_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_NOT_INITIALIZED", + ("HIPRAND_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_ALLOCATION_FAILED", + 
("HIPRAND_STATUS_ALLOCATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_TYPE_ERROR", + ("HIPRAND_STATUS_TYPE_ERROR", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_OUT_OF_RANGE", + ("HIPRAND_STATUS_OUT_OF_RANGE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_LENGTH_NOT_MULTIPLE", + ("HIPRAND_STATUS_LENGTH_NOT_MULTIPLE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED", + ( + "HIPRAND_STATUS_DOUBLE_PRECISION_REQUIRED", + CONV_NUMERIC_LITERAL, + API_RAND, + ), + ), + ( + "CURAND_STATUS_LAUNCH_FAILURE", + ("HIPRAND_STATUS_LAUNCH_FAILURE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_PREEXISTING_FAILURE", + ("HIPRAND_STATUS_PREEXISTING_FAILURE", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_INITIALIZATION_FAILED", + ("HIPRAND_STATUS_INITIALIZATION_FAILED", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_ARCH_MISMATCH", + ("HIPRAND_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_STATUS_INTERNAL_ERROR", + ("HIPRAND_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_RAND), + ), + ("CURAND_RNG_TEST", ("HIPRAND_RNG_TEST", CONV_NUMERIC_LITERAL, API_RAND)), + ( + "mtgp32dc_params_fast_11213", + ("mtgp32dc_params_fast_11213", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_DEFAULT", + ("HIPRAND_RNG_PSEUDO_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_XORWOW", + ("HIPRAND_RNG_PSEUDO_XORWOW", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_MRG32K3A", + ("HIPRAND_RNG_PSEUDO_MRG32K3A", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_MTGP32", + ("HIPRAND_RNG_PSEUDO_MTGP32", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_MT19937", + ("HIPRAND_RNG_PSEUDO_MT19937", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_PSEUDO_PHILOX4_32_10", + ("HIPRAND_RNG_PSEUDO_PHILOX4_32_10", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_DEFAULT", + ("HIPRAND_RNG_QUASI_DEFAULT", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SOBOL32", + ("HIPRAND_RNG_QUASI_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SCRAMBLED_SOBOL32", + ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL32", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SOBOL64", + ("HIPRAND_RNG_QUASI_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "CURAND_RNG_QUASI_SCRAMBLED_SOBOL64", + ("HIPRAND_RNG_QUASI_SCRAMBLED_SOBOL64", CONV_NUMERIC_LITERAL, API_RAND), + ), + ( + "curand_ORDERING_PSEUDO_BEST", + ( + "HIPRAND_ORDERING_PSEUDO_BEST", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_ORDERING_PSEUDO_DEFAULT", + ( + "HIPRAND_ORDERING_PSEUDO_DEFAULT", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_ORDERING_PSEUDO_SEEDED", + ( + "HIPRAND_ORDERING_PSEUDO_SEEDED", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_ORDERING_QUASI_DEFAULT", + ( + "HIPRAND_ORDERING_QUASI_DEFAULT", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_DIRECTION_VECTORS_32_JOEKUO6", + ( + "HIPRAND_DIRECTION_VECTORS_32_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", + ( + "HIPRAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_DIRECTION_VECTORS_64_JOEKUO6", + ( + "HIPRAND_DIRECTION_VECTORS_64_JOEKUO6", + CONV_NUMERIC_LITERAL, + 
API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", + ( + "HIPRAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6", + CONV_NUMERIC_LITERAL, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_CHOOSE_BEST", + ("HIPRAND_CHOOSE_BEST", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_ITR", + ("HIPRAND_ITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_KNUTH", + ("HIPRAND_KNUTH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_HITR", + ("HIPRAND_HITR", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ("curand_M1", ("HIPRAND_M1", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), + ("curand_M2", ("HIPRAND_M2", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED)), + ( + "curand_BINARY_SEARCH", + ("HIPRAND_BINARY_SEARCH", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_DISCRETE_GAUSS", + ("HIPRAND_DISCRETE_GAUSS", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_REJECTION", + ("HIPRAND_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_DEVICE_API", + ("HIPRAND_DEVICE_API", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_FAST_REJECTION", + ("HIPRAND_FAST_REJECTION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_3RD", + ("HIPRAND_3RD", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_DEFINITION", + ("HIPRAND_DEFINITION", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_POISSON", + ("HIPRAND_POISSON", CONV_NUMERIC_LITERAL, API_RAND, HIP_UNSUPPORTED), + ), + ("curandCreateGenerator", ("hiprandCreateGenerator", CONV_MATH_FUNC, API_RAND)), + ( + "curandCreateGeneratorHost", + ("hiprandCreateGeneratorHost", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandCreatePoissonDistribution", + ("hiprandCreatePoissonDistribution", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandDestroyDistribution", + ("hiprandDestroyDistribution", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandDestroyGenerator", + ("hiprandDestroyGenerator", CONV_MATH_FUNC, API_RAND), + ), + ("curandGenerate", ("hiprandGenerate", CONV_MATH_FUNC, API_RAND)), + ( + "curandGenerateLogNormal", + ("hiprandGenerateLogNormal", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandGenerateLogNormalDouble", + ("hiprandGenerateLogNormalDouble", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandGenerateLongLong", + ("hiprandGenerateLongLong", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ("curandGenerateNormal", ("hiprandGenerateNormal", CONV_MATH_FUNC, API_RAND)), + ( + "curandGenerateNormalDouble", + ("hiprandGenerateNormalDouble", CONV_MATH_FUNC, API_RAND), + ), + ("curandGeneratePoisson", ("hiprandGeneratePoisson", CONV_MATH_FUNC, API_RAND)), + ("curandGenerateSeeds", ("hiprandGenerateSeeds", CONV_MATH_FUNC, API_RAND)), + ("curandGenerateUniform", ("hiprandGenerateUniform", CONV_MATH_FUNC, API_RAND)), + ( + "curandGenerateUniformDouble", + ("hiprandGenerateUniformDouble", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandGetDirectionVectors32", + ("hiprandGetDirectionVectors32", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandGetDirectionVectors64", + ("hiprandGetDirectionVectors64", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandGetProperty", + ("hiprandGetProperty", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandGetScrambleConstants32", + ( + "hiprandGetScrambleConstants32", + CONV_MATH_FUNC, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curandGetScrambleConstants64", + ( 
+ "hiprandGetScrambleConstants64", + CONV_MATH_FUNC, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ("curandGetVersion", ("hiprandGetVersion", CONV_MATH_FUNC, API_RAND)), + ( + "curandSetGeneratorOffset", + ("hiprandSetGeneratorOffset", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandSetGeneratorOrdering", + ("hiprandSetGeneratorOrdering", CONV_MATH_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curandSetPseudoRandomGeneratorSeed", + ("hiprandSetPseudoRandomGeneratorSeed", CONV_MATH_FUNC, API_RAND), + ), + ( + "curandSetQuasiRandomGeneratorDimensions", + ("hiprandSetQuasiRandomGeneratorDimensions", CONV_MATH_FUNC, API_RAND), + ), + ("curandSetStream", ("hiprandSetStream", CONV_MATH_FUNC, API_RAND)), + ("curand", ("hiprand", CONV_DEVICE_FUNC, API_RAND)), + ("curand4", ("hiprand4", CONV_DEVICE_FUNC, API_RAND)), + ("curand_init", ("hiprand_init", CONV_DEVICE_FUNC, API_RAND)), + ("curand_log_normal", ("hiprand_log_normal", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_log_normal_double", + ("hiprand_log_normal_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_log_normal2", ("hiprand_log_normal2", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_log_normal2_double", + ("hiprand_log_normal2_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_log_normal4", ("hiprand_log_normal4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_log_normal4_double", + ("hiprand_log_normal4_double", CONV_DEVICE_FUNC, API_RAND), + ), + ( + "curand_mtgp32_single", + ("hiprand_mtgp32_single", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ( + "curand_mtgp32_single_specific", + ( + "hiprand_mtgp32_single_specific", + CONV_DEVICE_FUNC, + API_RAND, + HIP_UNSUPPORTED, + ), + ), + ( + "curand_mtgp32_specific", + ("hiprand_mtgp32_specific", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ("curand_normal", ("hiprand_normal", CONV_DEVICE_FUNC, API_RAND)), + ( + "curandMakeMTGP32Constants", + ("hiprandMakeMTGP32Constants", CONV_DEVICE_FUNC, API_RAND), + ), + ( + "curandMakeMTGP32KernelState", + ("hiprandMakeMTGP32KernelState", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_normal_double", ("hiprand_normal_double", CONV_DEVICE_FUNC, API_RAND)), + ("curand_normal2", ("hiprand_normal2", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_normal2_double", + ("hiprand_normal2_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_normal4", ("hiprand_normal4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_normal4_double", + ("hiprand_normal4_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_uniform", ("hiprand_uniform", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_uniform_double", + ("hiprand_uniform_double", CONV_DEVICE_FUNC, API_RAND), + ), + ( + "curand_uniform2_double", + ("hiprand_uniform2_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_uniform4", ("hiprand_uniform4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_uniform4_double", + ("hiprand_uniform4_double", CONV_DEVICE_FUNC, API_RAND), + ), + ("curand_discrete", ("hiprand_discrete", CONV_DEVICE_FUNC, API_RAND)), + ("curand_discrete4", ("hiprand_discrete4", CONV_DEVICE_FUNC, API_RAND)), + ("curand_poisson", ("hiprand_poisson", CONV_DEVICE_FUNC, API_RAND)), + ("curand_poisson4", ("hiprand_poisson4", CONV_DEVICE_FUNC, API_RAND)), + ( + "curand_Philox4x32_10", + ("hiprand_Philox4x32_10", CONV_DEVICE_FUNC, API_RAND, HIP_UNSUPPORTED), + ), + ("mtgp32_kernel_params", ("mtgp32_kernel_params_t", CONV_MATH_FUNC, API_RAND)), + ("CUFFT_FORWARD", ("HIPFFT_FORWARD", CONV_NUMERIC_LITERAL, API_BLAS)), + ("CUFFT_INVERSE", ("HIPFFT_BACKWARD", CONV_NUMERIC_LITERAL, API_BLAS)), + ( + 
"CUFFT_COMPATIBILITY_DEFAULT", + ( + "HIPFFT_COMPATIBILITY_DEFAULT", + CONV_NUMERIC_LITERAL, + API_BLAS, + HIP_UNSUPPORTED, + ), + ), + ("cufftResult_t", ("hipfftResult_t", CONV_TYPE, API_FFT)), + ("cufftResult", ("hipfftResult", CONV_TYPE, API_FFT)), + ("CUFFT_SUCCESS", ("HIPFFT_SUCCESS", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_INVALID_PLAN", ("HIPFFT_INVALID_PLAN", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_ALLOC_FAILED", ("HIPFFT_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_INVALID_TYPE", ("HIPFFT_INVALID_TYPE", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "CUFFT_INVALID_VALUE", + ("HIPFFT_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_INTERNAL_ERROR", + ("HIPFFT_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_FFT), + ), + ("CUFFT_EXEC_FAILED", ("HIPFFT_EXEC_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_SETUP_FAILED", ("HIPFFT_SETUP_FAILED", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_INVALID_SIZE", ("HIPFFT_INVALID_SIZE", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "CUFFT_UNALIGNED_DATA", + ("HIPFFT_UNALIGNED_DATA", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_INCOMPLETE_PARAMETER_LIST", + ("HIPFFT_INCOMPLETE_PARAMETER_LIST", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_INVALID_DEVICE", + ("HIPFFT_INVALID_DEVICE", CONV_NUMERIC_LITERAL, API_FFT), + ), + ("CUFFT_PARSE_ERROR", ("HIPFFT_PARSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_NO_WORKSPACE", ("HIPFFT_NO_WORKSPACE", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "CUFFT_NOT_IMPLEMENTED", + ("HIPFFT_NOT_IMPLEMENTED", CONV_NUMERIC_LITERAL, API_FFT), + ), + ( + "CUFFT_LICENSE_ERROR", + ("HIPFFT_LICENSE_ERROR", CONV_NUMERIC_LITERAL, API_FFT, HIP_UNSUPPORTED), + ), + ( + "CUFFT_NOT_SUPPORTED", + ("HIPFFT_NOT_SUPPORTED", CONV_NUMERIC_LITERAL, API_FFT), + ), + ("cufftType_t", ("hipfftType_t", CONV_TYPE, API_FFT)), + ("cufftType", ("hipfftType", CONV_TYPE, API_FFT)), + ("CUFFT_R2C", ("HIPFFT_R2C", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_C2R", ("HIPFFT_C2R", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_C2C", ("HIPFFT_C2C", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_D2Z", ("HIPFFT_D2Z", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_Z2D", ("HIPFFT_Z2D", CONV_NUMERIC_LITERAL, API_FFT)), + ("CUFFT_Z2Z", ("HIPFFT_Z2Z", CONV_NUMERIC_LITERAL, API_FFT)), + ( + "cufftCompatibility_t", + ("hipfftCompatibility_t", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + ), + ( + "cufftCompatibility", + ("hipfftCompatibility", CONV_TYPE, API_FFT, HIP_UNSUPPORTED), + ), + ( + "CUFFT_COMPATIBILITY_FFTW_PADDING", + ( + "HIPFFT_COMPATIBILITY_FFTW_PADDING", + CONV_NUMERIC_LITERAL, + API_FFT, + HIP_UNSUPPORTED, + ), + ), + ("cufftReal", ("hipfftReal", CONV_TYPE, API_FFT)), + ("cufftDoubleReal", ("hipfftDoubleReal", CONV_TYPE, API_FFT)), + ("cufftComplex", ("hipfftComplex", CONV_TYPE, API_FFT)), + ("cufftDoubleComplex", ("hipfftDoubleComplex", CONV_TYPE, API_FFT)), + ("cufftHandle", ("hipfftHandle", CONV_TYPE, API_FFT)), + ("cufftPlan1d", ("hipfftPlan1d", CONV_MATH_FUNC, API_FFT)), + ("cufftPlan2d", ("hipfftPlan2d", CONV_MATH_FUNC, API_FFT)), + ("cufftPlan3d", ("hipfftPlan3d", CONV_MATH_FUNC, API_FFT)), + ("cufftPlanMany", ("hipfftPlanMany", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlan1d", ("hipfftMakePlan1d", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlan2d", ("hipfftMakePlan2d", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlan3d", ("hipfftMakePlan3d", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlanMany", ("hipfftMakePlanMany", CONV_MATH_FUNC, API_FFT)), + ("cufftMakePlanMany64", ("hipfftMakePlanMany64", CONV_MATH_FUNC, API_FFT)), + 
("cufftGetSizeMany64", ("hipfftGetSizeMany64", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimate1d", ("hipfftEstimate1d", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimate2d", ("hipfftEstimate2d", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimate3d", ("hipfftEstimate3d", CONV_MATH_FUNC, API_FFT)), + ("cufftEstimateMany", ("hipfftEstimateMany", CONV_MATH_FUNC, API_FFT)), + ("cufftCreate", ("hipfftCreate", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize1d", ("hipfftGetSize1d", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize2d", ("hipfftGetSize2d", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize3d", ("hipfftGetSize3d", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSizeMany", ("hipfftGetSizeMany", CONV_MATH_FUNC, API_FFT)), + ("cufftGetSize", ("hipfftGetSize", CONV_MATH_FUNC, API_FFT)), + ("cufftSetWorkArea", ("hipfftSetWorkArea", CONV_MATH_FUNC, API_FFT)), + ( + "cufftSetAutoAllocation", + ("hipfftSetAutoAllocation", CONV_MATH_FUNC, API_FFT), + ), + ("cufftExecC2C", ("hipfftExecC2C", CONV_MATH_FUNC, API_FFT)), + ("cufftExecR2C", ("hipfftExecR2C", CONV_MATH_FUNC, API_FFT)), + ("cufftExecC2R", ("hipfftExecC2R", CONV_MATH_FUNC, API_FFT)), + ("cufftExecZ2Z", ("hipfftExecZ2Z", CONV_MATH_FUNC, API_FFT)), + ("cufftExecD2Z", ("hipfftExecD2Z", CONV_MATH_FUNC, API_FFT)), + ("cufftExecZ2D", ("hipfftExecZ2D", CONV_MATH_FUNC, API_FFT)), + ("cufftSetStream", ("hipfftSetStream", CONV_MATH_FUNC, API_FFT)), + ("cufftDestroy", ("hipfftDestroy", CONV_MATH_FUNC, API_FFT)), + ("cufftGetVersion", ("hipfftGetVersion", CONV_MATH_FUNC, API_FFT)), + ( + "cufftGetProperty", + ("hipfftGetProperty", CONV_MATH_FUNC, API_FFT, HIP_UNSUPPORTED), + ), + ("nvrtcResult", ("hiprtcResult", CONV_TYPE, API_RTC)), + ("NVRTC_SUCCESS", ("HIPRTC_SUCCESS", CONV_TYPE, API_RTC)), + ( + "NVRTC_ERROR_OUT_OF_MEMORY", + ("HIPRTC_ERROR_OUT_OF_MEMORY", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_PROGRAM_CREATION_FAILURE", + ("HIPRTC_ERROR_PROGRAM_CREATION_FAILURE", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_INVALID_INPUT", + ("HIPRTC_ERROR_INVALID_INPUT", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_INVALID_PROGRAM", + ("HIPRTC_ERROR_INVALID_PROGRAM", CONV_TYPE, API_RTC), + ), + ("NVRTC_ERROR_COMPILATION", ("HIPRTC_ERROR_COMPILATION", CONV_TYPE, API_RTC)), + ( + "NVRTC_ERROR_BUILTIN_OPERATION_FAILURE", + ("HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION", + ("HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID", + ("HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID", CONV_TYPE, API_RTC), + ), + ( + "NVRTC_ERROR_INTERNAL_ERROR", + ("HIPRTC_ERROR_INTERNAL_ERROR", CONV_TYPE, API_RTC), + ), + ("nvrtcGetErrorString", ("hiprtcGetErrorString", CONV_JIT, API_RTC)), + ("nvrtcVersion", ("hiprtcVersion", CONV_JIT, API_RTC)), + ("nvrtcProgram", ("hiprtcProgram", CONV_TYPE, API_RTC)), + ("nvrtcAddNameExpression", ("hiprtcAddNameExpression", CONV_JIT, API_RTC)), + ("nvrtcCompileProgram", ("hiprtcCompileProgram", CONV_JIT, API_RTC)), + ("nvrtcCreateProgram", ("hiprtcCreateProgram", CONV_JIT, API_RTC)), + ("nvrtcDestroyProgram", ("hiprtcDestroyProgram", CONV_JIT, API_RTC)), + ("nvrtcGetLoweredName", ("hiprtcGetLoweredName", CONV_JIT, API_RTC)), + ("nvrtcGetProgramLog", ("hiprtcGetProgramLog", CONV_JIT, API_RTC)), + ("nvrtcGetProgramLogSize", ("hiprtcGetProgramLogSize", CONV_JIT, API_RTC)), + ("nvrtcGetPTX", ("hiprtcGetCode", CONV_JIT, API_RTC)), + ("nvrtcGetPTXSize", ("hiprtcGetCodeSize", CONV_JIT, API_RTC)), + ("thrust::cuda", ("thrust::hip", 
CONV_MATH_FUNC, API_BLAS)), + ("cub::", ("hipcub::", CONV_MATH_FUNC, API_BLAS)), + ("nvtxMark", ("roctxMark", CONV_OTHER, API_ROCTX)), + ("nvtxMarkA", ("roctxMarkA", CONV_OTHER, API_ROCTX)), + ("nvtxRangePushA", ("roctxRangePushA", CONV_OTHER, API_ROCTX)), + ("nvtxRangePop", ("roctxRangePop", CONV_OTHER, API_ROCTX)), + ] +) + +CUDA_SPARSE_MAP = collections.OrderedDict( + [ + ("cusparseStatus_t", ("hipsparseStatus_t", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseHandle_t", ("hipsparseHandle_t", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseOperation_t", ("hipsparseOperation_t", CONV_TYPE, API_SPARSE)), + ( + "cusparseCreateMatDescr", + ("hipsparseCreateMatDescr", CONV_MATH_FUNC, API_SPARSE), + ), + ("cusparseCreate", ("hipsparseCreate", CONV_MATH_FUNC, API_SPARSE)), + ( + "cusparseDestroyMatDescr", + ("hipsparseDestroyMatDescr", CONV_MATH_FUNC, API_SPARSE), + ), + ("cusparseDestroy", ("hipsparseDestroy", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseXcoo2csr", ("hipsparseXcoo2csr", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseMatDescr_t", ("hipsparseMatDescr_t", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseScsrmm2", ("hipsparseScsrmm2", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseDcsrmm2", ("hipsparseDcsrmm2", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseScsrmm", ("hipsparseScsrmm", CONV_MATH_FUNC, API_SPARSE)), + ("cusparseDcsrmm", ("hipsparseDcsrmm", CONV_MATH_FUNC, API_SPARSE)), + ( + "cusparseXcsrsort_bufferSizeExt", + ("hipsparseXcsrsort_bufferSizeExt", CONV_MATH_FUNC, API_SPARSE), + ), + ("cusparseXcsrsort", ("hipsparseXcsrsort", CONV_MATH_FUNC, API_SPARSE)), + ( + "cusparseXcoosort_bufferSizeExt", + ("hipsparseXcoosort_bufferSizeExt", CONV_MATH_FUNC, API_SPARSE), + ), + ( + "cusparseXcoosortByRow", + ("hipsparseXcoosortByRow", CONV_MATH_FUNC, API_SPARSE), + ), + ("cusparseSetStream", ("hipsparseSetStream", CONV_MATH_FUNC, API_SPARSE)), + ( + "cusparseCreateIdentityPermutation", + ("hipsparseCreateIdentityPermutation", CONV_MATH_FUNC, API_SPARSE), + ), + ( + "cusparseSetMatIndexBase", + ("hipsparseSetMatIndexBase", CONV_MATH_FUNC, API_SPARSE), + ), + ("cusparseSetMatType", ("hipsparseSetMatType", CONV_MATH_FUNC, API_SPARSE)), + ( + "CUSPARSE_STATUS_SUCCESS", + ("HIPSPARSE_STATUS_SUCCESS", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_NOT_INITIALIZED", + ("HIPSPARSE_STATUS_NOT_INITIALIZED", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_ALLOC_FAILED", + ("HIPSPARSE_STATUS_ALLOC_FAILED", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_INVALID_VALUE", + ("HIPSPARSE_STATUS_INVALID_VALUE", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_MAPPING_ERROR", + ("HIPSPARSE_STATUS_MAPPING_ERROR", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_EXECUTION_FAILED", + ("HIPSPARSE_STATUS_EXECUTION_FAILED", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_INTERNAL_ERROR", + ("HIPSPARSE_STATUS_INTERNAL_ERROR", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED", + ( + "HIPSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED", + CONV_NUMERIC_LITERAL, + API_SPARSE, + ), + ), + ( + "CUSPARSE_STATUS_ARCH_MISMATCH", + ("HIPSPARSE_STATUS_ARCH_MISMATCH", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_STATUS_ZERO_PIVOT", + ("HIPSPARSE_STATUS_ZERO_PIVOT", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_OPERATION_TRANSPOSE", + ("HIPSPARSE_OPERATION_TRANSPOSE", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_OPERATION_NON_TRANSPOSE", + ("HIPSPARSE_OPERATION_NON_TRANSPOSE", CONV_NUMERIC_LITERAL, 
API_SPARSE), + ), + ( + "CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE", + ( + "HIPSPARSE_OPERATION_CONJUGATE_TRANSPOSE", + CONV_NUMERIC_LITERAL, + API_SPARSE, + ), + ), + ( + "CUSPARSE_INDEX_BASE_ZERO", + ("HIPSPARSE_INDEX_BASE_ZERO", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_INDEX_BASE_ONE", + ("HIPSPARSE_INDEX_BASE_ONE", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ( + "CUSPARSE_MATRIX_TYPE_GENERAL", + ("HIPSPARSE_MATRIX_TYPE_GENERAL", CONV_NUMERIC_LITERAL, API_SPARSE), + ), + ] +) + +PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict( + [ + ("USE_CUDA", ("USE_ROCM", API_PYTORCH)), + ("CUDA_VERSION", ("HIP_VERSION", API_PYTORCH)), + ("cudaHostAllocator", ("hipHostAllocator", API_PYTORCH)), + ("cudaDeviceAllocator", ("hipDeviceAllocator", API_PYTORCH)), + ("define MAX_NUM_BLOCKS 200", ("define MAX_NUM_BLOCKS 64", API_PYTORCH)), + ("cuda::CUDAGuard", ("hip::HIPGuardMasqueradingAsCUDA", API_PYTORCH)), + ("CUDAGuard", ("HIPGuardMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::OptionalCUDAGuard", + ("hip::OptionalHIPGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ("OptionalCUDAGuard", ("OptionalHIPGuardMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::CUDAStreamGuard", + ("hip::HIPStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ("CUDAStreamGuard", ("HIPStreamGuardMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::OptionalCUDAStreamGuard", + ("hip::OptionalHIPStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "OptionalCUDAStreamGuard", + ("OptionalHIPStreamGuardMasqueradingAsCUDA", API_PYTORCH), + ), + # Only get needs to be transformed this way; all the other ones can go + # straight to the normal versions hip::HIPCachingAllocator + ( + "cuda::CUDACachingAllocator::get", + ("hip::HIPCachingAllocatorMasqueradingAsCUDA::get", API_PYTORCH), + ), + ( + "CUDACachingAllocator::get", + ("HIPCachingAllocatorMasqueradingAsCUDA::get", API_PYTORCH), + ), + ( + "cuda::CUDACachingAllocator::recordStream", + ( + "hip::HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", + API_PYTORCH, + ), + ), + ( + "CUDACachingAllocator::recordStream", + ( + "HIPCachingAllocatorMasqueradingAsCUDA::recordStreamMasqueradingAsCUDA", + API_PYTORCH, + ), + ), + ("cuda::CUDAStream", ("hip::HIPStreamMasqueradingAsCUDA", API_PYTORCH)), + ("CUDAStream", ("HIPStreamMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::getStreamFromPool", + ("hip::getStreamFromPoolMasqueradingAsCUDA", API_PYTORCH), + ), + ("getStreamFromPool", ("getStreamFromPoolMasqueradingAsCUDA", API_PYTORCH)), + ( + "cuda::getDefaultCUDAStream", + ("hip::getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "getDefaultCUDAStream", + ("getDefaultHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "cuda::getCurrentCUDAStream", + ("hip::getCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "getCurrentCUDAStream", + ("getCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "cuda::setCurrentCUDAStream", + ("hip::setCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + ( + "setCurrentCUDAStream", + ("setCurrentHIPStreamMasqueradingAsCUDA", API_PYTORCH), + ), + # TODO: Undo this special-case; see the header for motivation behind this + # hack. It's VERY important this is only applied to PyTorch HIPify. 
+ ( + "c10/cuda/CUDAGuard.h", + ("ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h", API_PYTORCH), + ), + ( + "c10/cuda/CUDACachingAllocator.h", + ("ATen/hip/impl/HIPCachingAllocatorMasqueradingAsCUDA.h", API_PYTORCH), + ), + ( + "c10/cuda/CUDAStream.h", + ("ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h", API_PYTORCH), + ), + ("gloo/cuda.h", ("gloo/hip.h", API_PYTORCH)), + ( + "gloo/cuda_allreduce_halving_doubling.h", + ("gloo/hip_allreduce_halving_doubling.h", API_PYTORCH), + ), + ( + "gloo/cuda_allreduce_halving_doubling_pipelined.h", + ("gloo/hip_allreduce_halving_doubling_pipelined.h", API_PYTORCH), + ), + ("gloo/cuda_allreduce_ring.h", ("gloo/hip_allreduce_ring.h", API_PYTORCH)), + ( + "gloo/cuda_broadcast_one_to_all.h", + ("gloo/hip_broadcast_one_to_all.h", API_PYTORCH), + ), + ( + "gloo::CudaAllreduceHalvingDoublingPipelined", + ("gloo::HipAllreduceHalvingDoublingPipelined", API_PYTORCH), + ), + ("gloo::CudaBroadcastOneToAll", ("gloo::HipBroadcastOneToAll", API_PYTORCH)), + ("gloo::CudaHostWorkspace", ("gloo::HipHostWorkspace", API_PYTORCH)), + ("gloo::CudaDeviceWorkspace", ("gloo::HipDeviceWorkspace", API_PYTORCH)), + ] +) + +CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict( + [ + ("cuda_stream", ("hip_stream", API_CAFFE2)), + # if the header is a native hip folder (under hip directory), + # there is no need to add a hip path to it; the trie in hipify script + # takes this mapping order to forbid further replacement + ("/hip/", ("/hip/", API_CAFFE2)), + ("/context_gpu", ("/hip/context_gpu", API_CAFFE2)), + ("/common_gpu", ("/hip/common_gpu", API_CAFFE2)), + ("/mixed_utils", ("/hip/mixed_utils", API_CAFFE2)), + ("/operator_fallback_gpu", ("/hip/operator_fallback_gpu", API_CAFFE2)), + ( + "/spatial_batch_norm_op_impl", + ("/hip/spatial_batch_norm_op_impl", API_CAFFE2), + ), + ( + "/recurrent_network_executor_gpu", + ("/hip/recurrent_network_executor_gpu", API_CAFFE2), + ), + ( + "/generate_proposals_op_util_nms_gpu", + ("/hip/generate_proposals_op_util_nms_gpu", API_CAFFE2), + ), + ("/max_pool_with_index_gpu", ("/hip/max_pool_with_index_gpu", API_CAFFE2)), + ("/THCCachingAllocator_gpu", ("/hip/THCCachingAllocator_gpu", API_CAFFE2)), + ("/top_k_heap_selection", ("/hip/top_k_heap_selection", API_CAFFE2)), + ("/top_k_radix_selection", ("/hip/top_k_radix_selection", API_CAFFE2)), + ("/GpuDefs", ("/hip/GpuDefs", API_CAFFE2)), + ("/GpuScanUtils", ("/hip/GpuScanUtils", API_CAFFE2)), + ("/GpuBitonicSort", ("/hip/GpuBitonicSort", API_CAFFE2)), + ("/math/reduce.cuh", ("/math/hip/reduce.cuh", API_CAFFE2)), + ("/gather_op.cuh", ("/hip/gather_op.cuh", API_CAFFE2)), + ("caffe2/core/common_cudnn.h", ("caffe2/core/hip/common_miopen.h", API_CAFFE2)), + ("REGISTER_CUDA_OPERATOR", ("REGISTER_HIP_OPERATOR", API_CAFFE2)), + ("CUDA_1D_KERNEL_LOOP", ("HIP_1D_KERNEL_LOOP", API_CAFFE2)), + ("CUDAContext", ("HIPContext", API_CAFFE2)), + ("CAFFE_CUDA_NUM_THREADS", ("CAFFE_HIP_NUM_THREADS", API_CAFFE2)), + ("HasCudaGPU", ("HasHipGPU", API_CAFFE2)), + ("__expf", ("expf", API_CAFFE2)), + ("CUBLAS_ENFORCE", ("ROCBLAS_ENFORCE", API_CAFFE2)), + ("CUBLAS_CHECK", ("ROCBLAS_CHECK", API_CAFFE2)), + ("cublas_handle", ("rocblashandle", API_CAFFE2)), + ("CURAND_ENFORCE", ("HIPRAND_ENFORCE", API_CAFFE2)), + ("CURAND_CHECK", ("HIPRAND_CHECK", API_CAFFE2)), + ("curandGenerateUniform", ("hiprandGenerateUniform", API_CAFFE2)), + ("curand_generator", ("hiprand_generator", API_CAFFE2)), + ("CaffeCudaGetDevice", ("CaffeHipGetDevice", API_CAFFE2)), + ("CUDA", ("HIP", API_CAFFE2)), + ("Cuda", ("Hip", API_CAFFE2)), + ("cuda_", ("hip_", 
API_CAFFE2)), + ("_cuda", ("_hip", API_CAFFE2)), + ("CUDNN", ("MIOPEN", API_CAFFE2)), + ("CuDNN", ("MIOPEN", API_CAFFE2)), + ("cudnn", ("miopen", API_CAFFE2)), + ("namespace cuda", ("namespace hip", API_CAFFE2)), + ("cuda::CUDAGuard", ("hip::HIPGuard", API_CAFFE2)), + ("cuda::OptionalCUDAGuard", ("hip::OptionalHIPGuard", API_CAFFE2)), + ("cuda::CUDAStreamGuard", ("hip::HIPStreamGuard", API_CAFFE2)), + ("cuda::OptionalCUDAStreamGuard", ("hip::OptionalHIPStreamGuard", API_CAFFE2)), + ("c10/cuda/CUDAGuard.h", ("c10/hip/HIPGuard.h", API_CAFFE2)), + ("gloo/cuda", ("gloo/hip", API_CAFFE2)), + ] +) + +# We must tread very carefully here. Blanket conversions like are done +# in CAFFE2_SPECIFIC_MAPPINGS are not presently supported on PyTorch, +# because a regex for CUDA will also match a filename like CUDAGuard.h, +# but the HIPIFY script doesn't presently move the file and so the substitution +# will be invalid. Instead, we specifically list out every identifier +# and file from c10/cuda which may be used externally, and do substitutions this +# way. +# +# NB: if you want a transformation to ONLY apply to the c10/ directory, +# put it as API_CAFFE2 +C10_MAPPINGS = collections.OrderedDict( + [ + ("cuda::compat::", ("hip::compat::", API_C10)), + ("c10/cuda/CUDAException.h", ("c10/hip/HIPException.h", API_C10)), + ("c10/cuda/CUDAMacros.h", ("c10/hip/HIPMacros.h", API_C10)), + ("c10/cuda/CUDAMathCompat.h", ("c10/hip/HIPMathCompat.h", API_C10)), + ("c10/cuda/CUDAFunctions.h", ("c10/hip/HIPFunctions.h", API_C10)), + ("c10/cuda/CUDAStream.h", ("c10/hip/HIPStream.h", API_C10)), + ("c10/cuda/CUDACachingAllocator.h", ("c10/hip/HIPCachingAllocator.h", API_C10)), + ("c10/cuda/impl/CUDATest.h", ("c10/hip/impl/HIPTest.h", API_C10)), + ("c10/cuda/impl/CUDAGuardImpl.h", ("c10/hip/impl/HIPGuardImpl.h", API_C10)), + ( + "c10/cuda/impl/cuda_cmake_macros.h", + ("c10/hip/impl/hip_cmake_macros.h", API_C10), + ), + ("C10_CUDA_CHECK", ("C10_HIP_CHECK", API_C10)), + ("C10_CUDA_CHECK_WARN", ("C10_HIP_CHECK_WARN", API_C10)), + ("c10::cuda", ("c10::hip", API_C10)), + ("cuda::CUDAStream", ("hip::HIPStream", API_C10)), + ("CUDAStream", ("HIPStream", API_C10)), + # This substitution is not permissible, because there's another copy of this + # function in torch/cuda.h + # ("cuda::device_count", ("hip::device_count", API_C10)), + ("cuda::current_device", ("hip::current_device", API_C10)), + ("cuda::set_device", ("hip::set_device", API_C10)), + ("cuda::getStreamFromPool", ("hip::getStreamFromPool", API_C10)), + ("getStreamFromPool", ("getStreamFromPool", API_C10)), + ("cuda::getDefaultCUDAStream", ("hip::getDefaultHIPStream", API_C10)), + ("getDefaultCUDAStream", ("getDefaultHIPStream", API_C10)), + ("cuda::getCurrentCUDAStream", ("hip::getCurrentHIPStream", API_C10)), + ("getCurrentCUDAStream", ("getCurrentHIPStream", API_C10)), + ("cuda::setCurrentCUDAStream", ("hip::setCurrentHIPStream", API_C10)), + ("setCurrentCUDAStream", ("setCurrentHIPStream", API_C10)), + ("cuda::CUDACachingAllocator", ("hip::HIPCachingAllocator", API_C10)), + ("CUDACachingAllocator", ("HIPCachingAllocator", API_C10)), + ] +) + +# NB: C10 mappings are more specific than Caffe2 mappings, so run them +# first +CUDA_TO_HIP_MAPPINGS = [ + CUDA_IDENTIFIER_MAP, + CUDA_TYPE_NAME_MAP, + CUDA_INCLUDE_MAP, + CUDA_SPARSE_MAP, + C10_MAPPINGS, + PYTORCH_SPECIFIC_MAPPINGS, + CAFFE2_SPECIFIC_MAPPINGS, +] diff --git a/tools/amd_build/pyHIPIFY/hipify_python.py b/torch/utils/hipify/hipify_python.py similarity index 98% rename from tools/amd_build/pyHIPIFY/hipify_python.py 
rename to torch/utils/hipify/hipify_python.py
index 2a0de6b36667c..fe42e807e6efa 100755
--- a/tools/amd_build/pyHIPIFY/hipify_python.py
+++ b/torch/utils/hipify/hipify_python.py
@@ -32,9 +32,9 @@
 import sys
 import os

-from pyHIPIFY import constants
-from pyHIPIFY.cuda_to_hip_mappings import CUDA_TO_HIP_MAPPINGS
-from pyHIPIFY.cuda_to_hip_mappings import MATH_TRANSPILATIONS
+from . import constants
+from .cuda_to_hip_mappings import CUDA_TO_HIP_MAPPINGS
+from .cuda_to_hip_mappings import MATH_TRANSPILATIONS

 # Hardcode the PyTorch template map
 """This dictionary provides the mapping from PyTorch kernel template types
@@ -366,9 +366,13 @@ def find_parentheses_group(input_string, start):

 def replace_math_functions(input_string):
-    """ FIXME: Temporarily replace std:: invocations of math functions with non-std:: versions to prevent linker errors
-        NOTE: This can lead to correctness issues when running tests, since the correct version of the math function (exp/expf) might not get called.
-        Plan is to remove this function once HIP supports std:: math function calls inside device code
+    """FIXME: Temporarily replace std:: invocations of math functions
+       with non-std:: versions to prevent linker errors NOTE: This
+       can lead to correctness issues when running tests, since the
+       correct version of the math function (exp/expf) might not get
+       called. Plan is to remove this function once HIP supports
+       std:: math function calls inside device code
+
     """
     output_string = input_string
     for func in MATH_TRANSPILATIONS:

From 24242e86fab134d4993b19807a7cf3e3585c4117 Mon Sep 17 00:00:00 2001
From: Pritam Damania
Date: Mon, 7 Oct 2019 17:37:58 -0700
Subject: [PATCH 103/113] Ensure NCCL error handling code is disabled for NCCL
 versions < 2.4 (#27124)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27124

ncclCommAbort() and ncclCommGetAsyncError() were two APIs added in NCCL 2.4
to detect errors in NCCL communicators. These were used as part of
ProcessGroupNCCL, and we also enforced that only NCCL versions 2.4+ were
supported. However, there is still legitimate use for older NCCL versions,
and hence we should still support those. For that purpose, in this change
I've ensured we disable NCCL error checking for versions < 2.4.
ghstack-source-id: 91452959

Test Plan:
1) Test with 2.4.8
2) Test with 2.2.13
3) unit tests.
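In effect, the gate that these tests (and the new ENABLE_NCCL_ERROR_CHECKING
define) rely on reduces to a single version comparison. A minimal sketch,
assuming torch.cuda.nccl.version() returns the compiled NCCL version as an
int (e.g. 2408 for NCCL 2.4.8, matching the 2400 threshold in this patch):

    import torch

    NCCL_ERROR_HANDLING_MIN_VERSION = 2400  # NCCL 2.4.0

    def nccl_error_handling_supported():
        # ncclCommAbort()/ncclCommGetAsyncError() exist only in NCCL 2.4+
        return torch.cuda.nccl.version() >= NCCL_ERROR_HANDLING_MIN_VERSION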
Differential Revision: D17178988 fbshipit-source-id: 5dc44b5f7b4b00466c67fd452315f1d4f5c47698 --- caffe2/CMakeLists.txt | 10 ++++++ test/common_distributed.py | 12 +++++++ test/test_c10d.py | 8 ++++- torch/CMakeLists.txt | 2 -- torch/csrc/cuda/nccl.cpp | 18 +++++----- torch/csrc/cuda/nccl.h | 14 ++++---- torch/csrc/cuda/python_nccl.cpp | 18 +++++----- torch/lib/c10d/NCCLUtils.hpp | 35 ++++++++++++++----- torch/lib/c10d/ProcessGroupNCCL.cpp | 4 +++ torch/lib/c10d/test/CMakeLists.txt | 3 +- .../c10d/test/ProcessGroupNCCLErrorsTest.cpp | 13 +++++-- 11 files changed, 97 insertions(+), 40 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index e0c1b1222dccf..37949fdd44652 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -470,6 +470,11 @@ if (NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE) ) endif() + if (USE_NCCL) + list(APPEND TORCH_SRCS + ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp) + endif() + if (NOT INTERN_BUILD_MOBILE) list(APPEND TORCH_SRCS ${TORCH_SRC_DIR}/csrc/api/src/jit.cpp @@ -612,6 +617,11 @@ ELSE() add_library(torch ${Caffe2_CPU_SRCS}) ENDIF() +if (USE_NCCL) + target_link_libraries(torch PRIVATE __caffe2_nccl) + target_compile_definitions(torch PRIVATE USE_NCCL) +endif() + # ========================================================== # formerly-libtorch flags diff --git a/test/common_distributed.py b/test/common_distributed.py index cecd1ae6557f9..dd5cd715fd874 100644 --- a/test/common_distributed.py +++ b/test/common_distributed.py @@ -64,6 +64,18 @@ def requires_gloo(): "c10d was not compiled with the Gloo backend", ) +def requires_nccl_version(version, msg): + if not c10d.is_nccl_available(): + return unittest.skip( + "c10d was not compiled with the NCCL backend", + ) + else: + return unittest.skipIf( + torch.cuda.nccl.version() < version, + "Requires NCCL version greater than or equal to: {}, found: {}, reason: {}".format( + version, + torch.cuda.nccl.version(), msg), + ) def requires_nccl(): return unittest.skipUnless( diff --git a/test/test_c10d.py b/test/test_c10d.py index d850aba95d56a..e5d70ba53df22 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -26,7 +26,7 @@ from torch.nn.parallel import DistributedDataParallel from common_distributed import MultiProcessTestCase, \ - requires_gloo, requires_nccl, \ + requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, skip_for_known_issues, get_timeout from common_utils import TestCase, load_tests, run_tests from common_utils import retry_on_address_already_in_use_error @@ -2993,6 +2993,7 @@ def _run_all_reduce(self, pg): pg.allreduce(torch.rand(10).cuda(self.rank)) @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_not_multigpu def test_nccl_errors_nonblocking(self): store = c10d.FileStore(self.file_name, self.world_size) @@ -3031,26 +3032,31 @@ def _test_nccl_errors_blocking(self, func): func() @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_not_multigpu def test_nccl_errors_blocking_clean_exit(self): self._test_nccl_errors_blocking(lambda : sys.exit(0)) @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_not_multigpu def test_nccl_errors_blocking_nonzero_exit(self): self._test_nccl_errors_blocking(lambda : sys.exit(1)) @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_not_multigpu def test_nccl_errors_blocking_abort(self): self._test_nccl_errors_blocking(lambda : os.abort()) 
@requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_not_multigpu def test_nccl_errors_blocking_sigkill(self): self._test_nccl_errors_blocking(lambda : os.kill(os.getpid(), signal.SIGKILL)) @requires_nccl() + @requires_nccl_version(2400, "Need NCCL 2.4+ for error checking") @skip_if_not_multigpu def test_nccl_errors_blocking_sigterm(self): self._test_nccl_errors_blocking(lambda : os.kill(os.getpid(), signal.SIGTERM)) diff --git a/torch/CMakeLists.txt b/torch/CMakeLists.txt index ad0b5a2930066..7372180d24d80 100644 --- a/torch/CMakeLists.txt +++ b/torch/CMakeLists.txt @@ -247,10 +247,8 @@ endif() if (USE_NCCL) list(APPEND TORCH_PYTHON_SRCS - ${TORCH_SRC_DIR}/csrc/cuda/nccl.cpp ${TORCH_SRC_DIR}/csrc/cuda/python_nccl.cpp) list(APPEND TORCH_PYTHON_COMPILE_DEFINITIONS USE_NCCL) - list(APPEND TORCH_PYTHON_LINK_LIBRARIES __caffe2_nccl) endif() # In the most recent CMake versions, a new 'TRANSFORM' subcommand of 'list' allows much of the boilerplate of defining the lists diff --git a/torch/csrc/cuda/nccl.cpp b/torch/csrc/cuda/nccl.cpp index 93c0aa4344dd0..f421d1c6fe73d 100644 --- a/torch/csrc/cuda/nccl.cpp +++ b/torch/csrc/cuda/nccl.cpp @@ -70,7 +70,7 @@ using device_list = std::vector; static std::unordered_map> _communicators; -ArrayRef _get_communicators(TensorList inputs) { +ArrayRef get_communicators(TensorList inputs) { static auto get_device = [](const at::Tensor& t) -> int { return t.get_device(); }; @@ -81,7 +81,7 @@ ArrayRef _get_communicators(TensorList inputs) { return it->second.ref(); } -ncclDataType_t _get_data_type(const Tensor& t) { +ncclDataType_t get_data_type(const Tensor& t) { if (t.type().backend() != Backend::CUDA) { throw std::runtime_error("Unconvertible NCCL type"); } @@ -105,7 +105,7 @@ ncclDataType_t _get_data_type(const Tensor& t) { } } -void _check_inputs( +void check_inputs( TensorList inputs, TensorList outputs, int input_multiplier, @@ -232,12 +232,12 @@ void broadcast( const comm_list& user_comms) { #ifdef USE_NCCL using namespace torch::cuda::nccl::detail; - _check_inputs(tensors, tensors, 1, 1); - ncclDataType_t data_type = _get_data_type(tensors[0]); + check_inputs(tensors, tensors, 1, 1); + ncclDataType_t data_type = get_data_type(tensors[0]); int64_t numel = tensors[0].numel(); AutoNcclGroup nccl_group_guard; - const auto comms = user_comms.empty() ? _get_communicators(tensors) + const auto comms = user_comms.empty() ? get_communicators(tensors) : ArrayRef(user_comms); at::cuda::OptionalCUDAGuard device_guard; @@ -276,14 +276,14 @@ void reduce( TORCH_CHECK( root >= 0 && static_cast(root) < inputs.size(), "invalid root"); - _check_inputs(inputs, outputs, 1, 1); + check_inputs(inputs, outputs, 1, 1); const auto len = inputs.size(); - ncclDataType_t data_type = _get_data_type(inputs[0]); + ncclDataType_t data_type = get_data_type(inputs[0]); const auto count = inputs[0].numel(); AutoNcclGroup nccl_group_guard; - auto comms_ref = user_comms.empty() ? _get_communicators(inputs) + auto comms_ref = user_comms.empty() ? 
get_communicators(inputs) : ArrayRef(user_comms); at::cuda::OptionalCUDAGuard device_guard; diff --git a/torch/csrc/cuda/nccl.h b/torch/csrc/cuda/nccl.h index 66aa4aecec6f5..1e92850043dcc 100644 --- a/torch/csrc/cuda/nccl.h +++ b/torch/csrc/cuda/nccl.h @@ -42,31 +42,31 @@ struct AutoNcclGroup { } }; -at::ArrayRef _get_communicators(at::TensorList inputs); -void _check_inputs( +TORCH_API at::ArrayRef get_communicators(at::TensorList inputs); +TORCH_API void check_inputs( at::TensorList inputs, at::TensorList outputs, int input_multiplier, int output_multiplier); -ncclDataType_t _get_data_type(const at::Tensor& t); +TORCH_API ncclDataType_t get_data_type(const at::Tensor& t); } // namespace detail using comm_list = std::vector; using stream_list = std::vector>; -std::uint64_t version(); +TORCH_API std::uint64_t version(); bool is_available(at::TensorList tensors); -void broadcast( +TORCH_API void broadcast( at::TensorList tensors, const stream_list& streams = {}, const comm_list& user_comms = {}); size_t get_max_count(); -void reduce( +TORCH_API void reduce( const std::vector& inputs, std::vector& outputs, int32_t root = 0, @@ -74,7 +74,7 @@ void reduce( const stream_list& streams = {}, const comm_list& user_comms = {}); -void reduce( +TORCH_API void reduce( std::vector& inputs, int32_t root = 0, int32_t op = ncclSum, diff --git a/torch/csrc/cuda/python_nccl.cpp b/torch/csrc/cuda/python_nccl.cpp index 092f3c0afe157..3601134322f39 100644 --- a/torch/csrc/cuda/python_nccl.cpp +++ b/torch/csrc/cuda/python_nccl.cpp @@ -185,14 +185,14 @@ PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) { auto user_comms = unpack_comms(_comms, inputs.size()); with_no_gil([&] { - _check_inputs(inputs, outputs, 1, 1); + check_inputs(inputs, outputs, 1, 1); size_t len = inputs.size(); - ncclDataType_t data_type = _get_data_type(inputs[0]); + ncclDataType_t data_type = get_data_type(inputs[0]); int64_t count = inputs[0].numel(); AutoNcclGroup nccl_group_guard; - auto comms = user_comms.empty() ? _get_communicators(inputs) + auto comms = user_comms.empty() ? get_communicators(inputs) : ArrayRef(user_comms); at::cuda::OptionalCUDAGuard device_guard; for (size_t i = 0; i < len; i++) { @@ -265,13 +265,13 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) { with_no_gil([&] { size_t len = inputs.size(); - _check_inputs(inputs, outputs, len, 1); + check_inputs(inputs, outputs, len, 1); - ncclDataType_t data_type = _get_data_type(inputs[0]); + ncclDataType_t data_type = get_data_type(inputs[0]); int64_t count = inputs[0].numel(); AutoNcclGroup nccl_group_guard; - auto comms = user_comms.empty() ? _get_communicators(inputs) + auto comms = user_comms.empty() ? get_communicators(inputs) : ArrayRef(user_comms); at::cuda::OptionalCUDAGuard device_guard; for (size_t i = 0; i < len; i++) { @@ -327,13 +327,13 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) { with_no_gil([&] { size_t len = inputs.size(); - _check_inputs(inputs, outputs, 1, len); + check_inputs(inputs, outputs, 1, len); - ncclDataType_t data_type = _get_data_type(inputs[0]); + ncclDataType_t data_type = get_data_type(inputs[0]); int64_t count = inputs[0].numel() / len; AutoNcclGroup nccl_group_guard; - auto comms = user_comms.empty() ? _get_communicators(inputs) + auto comms = user_comms.empty() ? 
get_communicators(inputs) : ArrayRef(user_comms); at::cuda::OptionalCUDAGuard device_guard; for (size_t i = 0; i < len; i++) { diff --git a/torch/lib/c10d/NCCLUtils.hpp b/torch/lib/c10d/NCCLUtils.hpp index 96e44b2f45624..395b8cab49d62 100644 --- a/torch/lib/c10d/NCCLUtils.hpp +++ b/torch/lib/c10d/NCCLUtils.hpp @@ -1,15 +1,17 @@ #pragma once -#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \ - (NCCL_MINOR < 4) -#error "Need NCCL version 2.4+" -#elif defined(NCCL_MAJOR) && (NCCL_MAJOR < 2) -#error "Need NCCL version 2.4+" -#endif - #include #include +// Error checking is enabled only for NCCL versions 2.4+ since ncclCommAbort() +// and ncclCommGetAsyncError() are not supported in earlier versions. +#if defined(NCCL_MAJOR) && (NCCL_MAJOR == 2) && defined(NCCL_MINOR) && \ + (NCCL_MINOR >= 4) +#define ENABLE_NCCL_ERROR_CHECKING +#elif defined(NCCL_MAJOR) && (NCCL_MAJOR >= 3) +#define ENABLE_NCCL_ERROR_CHECKING +#endif + #define C10D_NCCL_CHECK(cmd) \ do { \ ncclResult_t error = cmd; \ @@ -35,9 +37,14 @@ class NCCLComm { ~NCCLComm() noexcept(false) { if (ncclComm_ && !aborted_) { - // Use ncclCommAbort instead of ncclCommDestroy here since ncclCommDestroy - // could block forever waiting for work to complete on the communicator. +#ifdef ENABLE_NCCL_ERROR_CHECKING + // Use ncclCommAbort instead of ncclCommDestroy here since + // ncclCommDestroy could block forever waiting for work to complete on + // the communicator. ncclCommAbort(); +#else + C10D_NCCL_CHECK(::ncclCommDestroy(ncclComm_)); +#endif } } @@ -78,6 +85,7 @@ class NCCLComm { } void ncclCommAbort() { +#ifdef ENABLE_NCCL_ERROR_CHECKING if (aborted_) { // Should not abort twice. return; @@ -91,6 +99,10 @@ class NCCLComm { if (ncclAsyncErr_ == ncclSuccess) { ncclAsyncErr_ = ncclSystemError; } +#else + // This is a NOOP, if error checks are disabled. + return; +#endif } bool isAborted() const { @@ -98,11 +110,16 @@ class NCCLComm { } ncclResult_t checkForNcclError() { +#ifdef ENABLE_NCCL_ERROR_CHECKING if (ncclAsyncErr_ != ncclSuccess) { return ncclAsyncErr_; } C10D_NCCL_CHECK(ncclCommGetAsyncError(ncclComm_, &ncclAsyncErr_)); return ncclAsyncErr_; +#else + // Always return success, if error checks are disabled. 
+    return ncclSuccess;
+#endif
   }

  protected:
diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp
index 469c39245e70b..f26b42b57c978 100644
--- a/torch/lib/c10d/ProcessGroupNCCL.cpp
+++ b/torch/lib/c10d/ProcessGroupNCCL.cpp
@@ -246,14 +246,18 @@ ProcessGroupNCCL::ProcessGroupNCCL(
         std::string(NCCL_BLOCKING_WAIT));
   }

+#ifdef ENABLE_NCCL_ERROR_CHECKING
   ncclCommWatchdogThread_ =
       std::thread(&ProcessGroupNCCL::ncclCommWatchdog, this);
+#endif
 }

 ProcessGroupNCCL::~ProcessGroupNCCL() {
   terminateWatchdog_.store(true);
   watchdogCV_.notify_one();
+#ifdef ENABLE_NCCL_ERROR_CHECKING
   ncclCommWatchdogThread_.join();
+#endif
 }

 void ProcessGroupNCCL::ncclCommWatchdog() {
diff --git a/torch/lib/c10d/test/CMakeLists.txt b/torch/lib/c10d/test/CMakeLists.txt
index 5ed12a4ef3f99..ff34bc952887f 100644
--- a/torch/lib/c10d/test/CMakeLists.txt
+++ b/torch/lib/c10d/test/CMakeLists.txt
@@ -23,7 +23,8 @@ if(USE_CUDA)
   endif()
   if(USE_C10D_NCCL)
     c10d_add_test(ProcessGroupNCCLTest.cpp c10d c10d_cuda_test)
-    c10d_add_test(ProcessGroupNCCLErrorsTest.cpp c10d c10d_cuda_test gtest_main)
+    c10d_add_test(ProcessGroupNCCLErrorsTest.cpp c10d c10d_cuda_test
+        gtest_main)
   endif()
 else()
   if(USE_C10D_GLOO)
diff --git a/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
index 4541111bbfd2c..9745324cfb011 100644
--- a/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
+++ b/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
@@ -3,9 +3,12 @@
 #include
 #include
 #include
+#include

 using namespace c10d::test;

+constexpr int kNcclErrorHandlingVersion = 2400;
+
 class WorkNCCLSimulateErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
  public:
   WorkNCCLSimulateErrors(
@@ -71,8 +74,14 @@ class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
 class ProcessGroupNCCLErrorsTest : public ::testing::Test {
  protected:
   bool skipTest() {
-    // Skip test if no cuda devices found.
-    return cudaNumDevices() == 0;
+    if (cudaNumDevices() == 0) {
+      return true;
+    }
+#ifdef USE_C10D_NCCL
+    return torch::cuda::nccl::version() < kNcclErrorHandlingVersion;
+#else
+    return false;
+#endif
   }

   void SetUp() override {

From 23f2fb0aecb1cb2d99fcca307b1748807c432d26 Mon Sep 17 00:00:00 2001
From: Pavel Belevich
Date: Mon, 7 Oct 2019 18:39:38 -0700
Subject: [PATCH 104/113] #include <functional> into flat_hash_map.h (#27478)

Summary:
Fixing https://github.com/pytorch/pytorch/issues/27266

In general we should not rely on transitively included headers; we should
explicitly include all headers if their members are used in the source file.
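The same pitfall has a direct Python analogue, shown here as a rough
standard-library-only illustration:

    import concurrent

    # Relying on a transitive import is fragile: concurrent.futures is only
    # reachable here if some other module happened to import it already.
    # concurrent.futures.ThreadPoolExecutor  # may raise AttributeError

    import concurrent.futures  # the explicit import is always safe
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)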
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27478 Differential Revision: D17799522 Pulled By: pbelevich fbshipit-source-id: 5818394a212c947cfac3a6cf042af9ebb8b9d9a0 --- c10/util/flat_hash_map.h | 1 + 1 file changed, 1 insertion(+) diff --git a/c10/util/flat_hash_map.h b/c10/util/flat_hash_map.h index 500336b5ed87d..c513a61ab4899 100644 --- a/c10/util/flat_hash_map.h +++ b/c10/util/flat_hash_map.h @@ -21,6 +21,7 @@ #include #include #include +#include #ifndef _MSC_VER #pragma GCC diagnostic push From 84e2dc692ab50125fbd871b4e5f8a8893f4bd5db Mon Sep 17 00:00:00 2001 From: James Reed Date: Mon, 7 Oct 2019 20:03:09 -0700 Subject: [PATCH 105/113] Fix broken name mangling Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27511 Test Plan: Imported from OSS Differential Revision: D17801185 Pulled By: jamesr66a fbshipit-source-id: 3eaa9542a445c9401f3f96e11138ec09b0d8350a --- torch/csrc/jit/script/compiler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 4f8031a3e9615..531f58240e66b 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -3258,6 +3258,7 @@ c10::QualifiedName CompilationUnit::mangle( newAtom.reserve(atom.size()); // Append the part of the name up to the end of the prefix newAtom.append(atom, 0, pos); + newAtom.append(manglePrefix); newAtom.append(std::to_string(mangleIndex_++)); atom = newAtom; return QualifiedName(atoms); From 1b5df374417087e69c54f0f670d466cd25b4a3f0 Mon Sep 17 00:00:00 2001 From: svcscm Date: Tue, 8 Oct 2019 01:07:05 -0700 Subject: [PATCH 106/113] Updating submodules Summary: GitHub commits: https://github.com/facebook/fbthrift/commit/e80ecd1d63c956ed34b257fbd1aaef73ef8eb781 https://github.com/facebook/proxygen/commit/6c7a36b1b3f2825fd30ba00c708ec5ceaa5db760 https://github.com/facebookincubator/mvfst/commit/875046204325f9bd8cc5343b98a8fa4b99187a3c https://github.com/facebook/proxygen/commit/442d7def679c297427f5d0b679685db92fe3d28c https://github.com/facebook/wangle/commit/c138dc3d2c0c4f4f68ab4931e44b87a6becb194c https://github.com/facebookincubator/fizz/commit/3833f10989711256704260a01e0c9f7d1c33e468 https://github.com/facebookincubator/katran/commit/6fc473d5304985aa31d351c6305904e80af4b614 https://github.com/pytorch/fbgemm/commit/82d259dade58e53775a534f88b7b48e760f09a64 Test Plan: n/a Reviewed By: 2d2d2d2d2d fbshipit-source-id: 7834a4a8620d0ab9b60060e0abadfba457fb2890 --- third_party/fbgemm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/fbgemm b/third_party/fbgemm index 8786c0819029c..82d259dade58e 160000 --- a/third_party/fbgemm +++ b/third_party/fbgemm @@ -1 +1 @@ -Subproject commit 8786c0819029c076b0e28320e880ba3ac192ea8b +Subproject commit 82d259dade58e53775a534f88b7b48e760f09a64 From 34662f77c665ee8a8369a7d23fe0560992de92e9 Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Tue, 8 Oct 2019 01:56:28 -0700 Subject: [PATCH 107/113] Revert D17159707: [pytorch][PR] [ONNX] Fixed Select symbolic to export slice when index = negative one Test Plan: revert-hammer Differential Revision: D17159707 Original commit changeset: 2c3b27542108 fbshipit-source-id: accce910abdbe13270d0f592810a48b1dabe4b01 --- caffe2/onnx/backend.cc | 18 ++++++++------ .../TestOperators.test_dyn_arange.expect | 12 +++++----- .../expect/TestOperators.test_full.expect | 24 +++++++++---------- ...TestOperators.test_upsample_nearest.expect | 24 +++++++++---------- .../TestOperators.test_view_flatten.expect | 24 
+++++++++---------- test/onnx/test_onnx_opset.py | 16 ++++++------- test/onnx/test_pytorch_onnx_caffe2.py | 1 + test/onnx/test_pytorch_onnx_onnxruntime.py | 3 ++- torch/onnx/symbolic_opset9.py | 14 +++++------ 9 files changed, 70 insertions(+), 66 deletions(-) diff --git a/caffe2/onnx/backend.cc b/caffe2/onnx/backend.cc index 284be62b96273..e0943294580c1 100644 --- a/caffe2/onnx/backend.cc +++ b/caffe2/onnx/backend.cc @@ -709,16 +709,20 @@ Caffe2Ops Caffe2Backend::CreateGather( std::vector inputs; inputs.emplace_back(node.input(0)); inputs.emplace_back(node.input(1)); - - auto axis = onnx_node->attributes.get("axis", 0L); - caffe2::Argument arg_axis; - arg_axis.set_name("axis"); - arg_axis.set_i(axis); - std::vector outputs; outputs.emplace_back(node.output(0)); - BuildOperator(c2_op, "Gather", inputs, outputs, {arg_axis}); + auto axis = onnx_node->attributes.get("axis", 0L); + if (axis == 0) { + BuildOperator(c2_op, "Gather", inputs, outputs); + } else if (axis == 1) { + BuildOperator(c2_op, "BatchGather", inputs, outputs); + } else { + CAFFE_THROW( + "Caffe2 only supports Gather with axis being 0 or 1, ", + "whereas axis is ", + axis); + } return ret; } diff --git a/test/onnx/expect/TestOperators.test_dyn_arange.expect b/test/onnx/expect/TestOperators.test_dyn_arange.expect index 2dd557d6af4db..552114be962bb 100644 --- a/test/onnx/expect/TestOperators.test_dyn_arange.expect +++ b/test/onnx/expect/TestOperators.test_dyn_arange.expect @@ -3,12 +3,7 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { - input: "0" output: "1" - op_type: "Shape" - } - node { - output: "2" op_type: "Constant" attribute { name: "value" @@ -20,8 +15,13 @@ graph { } } node { - input: "1" + input: "0" + output: "2" + op_type: "Shape" + } + node { input: "2" + input: "1" output: "3" op_type: "Gather" attribute { diff --git a/test/onnx/expect/TestOperators.test_full.expect b/test/onnx/expect/TestOperators.test_full.expect index c11d3e94c1f8a..d13667ef84757 100644 --- a/test/onnx/expect/TestOperators.test_full.expect +++ b/test/onnx/expect/TestOperators.test_full.expect @@ -3,12 +3,7 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { - input: "0" output: "1" - op_type: "Shape" - } - node { - output: "2" op_type: "Constant" attribute { name: "value" @@ -20,8 +15,13 @@ graph { } } node { - input: "1" + input: "0" + output: "2" + op_type: "Shape" + } + node { input: "2" + input: "1" output: "3" op_type: "Gather" attribute { @@ -31,12 +31,7 @@ graph { } } node { - input: "0" output: "4" - op_type: "Shape" - } - node { - output: "5" op_type: "Constant" attribute { name: "value" @@ -48,8 +43,13 @@ graph { } } node { - input: "4" + input: "0" + output: "5" + op_type: "Shape" + } + node { input: "5" + input: "4" output: "6" op_type: "Gather" attribute { diff --git a/test/onnx/expect/TestOperators.test_upsample_nearest.expect b/test/onnx/expect/TestOperators.test_upsample_nearest.expect index baf27592822b0..16451536e1819 100644 --- a/test/onnx/expect/TestOperators.test_upsample_nearest.expect +++ b/test/onnx/expect/TestOperators.test_upsample_nearest.expect @@ -3,12 +3,7 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { - input: "input" output: "1" - op_type: "Shape" - } - node { - output: "2" op_type: "Constant" attribute { name: "value" @@ -20,8 +15,13 @@ graph { } } node { - input: "1" + input: "input" + output: "2" + op_type: "Shape" + } + node { input: "2" + input: "1" output: "3" op_type: "Gather" attribute { @@ -74,12 +74,7 @@ graph { op_type: "Floor" } node { - input: 
"input" output: "9" - op_type: "Shape" - } - node { - output: "10" op_type: "Constant" attribute { name: "value" @@ -91,8 +86,13 @@ graph { } } node { - input: "9" + input: "input" + output: "10" + op_type: "Shape" + } + node { input: "10" + input: "9" output: "11" op_type: "Gather" attribute { diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect index 9909c19c15265..4d81920b95f75 100644 --- a/test/onnx/expect/TestOperators.test_view_flatten.expect +++ b/test/onnx/expect/TestOperators.test_view_flatten.expect @@ -3,12 +3,7 @@ producer_name: "pytorch" producer_version: "1.3" graph { node { - input: "0" output: "1" - op_type: "Shape" - } - node { - output: "2" op_type: "Constant" attribute { name: "value" @@ -20,8 +15,13 @@ graph { } } node { - input: "1" + input: "0" + output: "2" + op_type: "Shape" + } + node { input: "2" + input: "1" output: "3" op_type: "Gather" attribute { @@ -31,12 +31,7 @@ graph { } } node { - input: "0" output: "4" - op_type: "Shape" - } - node { - output: "5" op_type: "Constant" attribute { name: "value" @@ -48,8 +43,13 @@ graph { } } node { - input: "4" + input: "0" + output: "5" + op_type: "Shape" + } + node { input: "5" + input: "4" output: "6" op_type: "Gather" attribute { diff --git a/test/onnx/test_onnx_opset.py b/test/onnx/test_onnx_opset.py index cfcd510b5940f..0efe3dd875d63 100644 --- a/test/onnx/test_onnx_opset.py +++ b/test/onnx/test_onnx_opset.py @@ -186,7 +186,7 @@ def forward(self, x): "attributes" : []}] ops = {9 : ops_9, 10 : ops_10} x = torch.randn(3) - check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9]) + check_onnx_opsets_operator(MyModule(), x, ops, opset_versions=[9, 10]) class DynamicSliceModel(torch.jit.ScriptModule): @torch.jit.script_method @@ -196,7 +196,6 @@ def forward(self, x): ops_9 = [{"op_name" : "Constant"}, {"op_name" : "Constant"}, {"op_name" : "Shape"}, - {"op_name": "Constant"}, {"op_name" : "Gather", "attributes" : [{"name" : "axis", "i" : 0, "type" : 2}]}, {"op_name" : "Unsqueeze", @@ -209,7 +208,6 @@ def forward(self, x): ops_10 = [{"op_name" : "Constant"}, {"op_name" : "Constant"}, {"op_name" : "Shape"}, - {"op_name": "Constant"}, {"op_name" : "Gather", "attributes" : [{"name" : "axis", "i" : 0, "type" : 2}]}, {"op_name" : "Unsqueeze", @@ -286,11 +284,11 @@ def forward(self, x): return torch.nn.functional.interpolate(x, size=size, mode='nearest') - ops_9 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, + ops_9 = [{"op_name" : "Constant"}, {"op_name" : "Shape"}, + {"op_name" : "Gather"}, {"op_name" : "Constant"}, + {"op_name" : "Shape"}, {"op_name" : "Gather"}, {"op_name" : "Constant"}, {"op_name" : "Mul"}, @@ -309,11 +307,11 @@ def forward(self, x): {"op_name" : "Upsample", "attributes" : [{"name": "mode", "s": ("nearest").encode(), "type": 3}]}] - ops_10 = [{"op_name" : "Shape"}, - {"op_name" : "Constant"}, - {"op_name" : "Gather"}, + ops_10 = [{"op_name" : "Constant"}, {"op_name" : "Shape"}, + {"op_name" : "Gather"}, {"op_name" : "Constant"}, + {"op_name" : "Shape"}, {"op_name" : "Gather"}, {"op_name" : "Constant"}, {"op_name" : "Mul"}, diff --git a/test/onnx/test_pytorch_onnx_caffe2.py b/test/onnx/test_pytorch_onnx_caffe2.py index fac6715f167c5..32ff6d97c56f6 100644 --- a/test/onnx/test_pytorch_onnx_caffe2.py +++ b/test/onnx/test_pytorch_onnx_caffe2.py @@ -1306,6 +1306,7 @@ def forward(self, x): x = torch.randn(3, 4, 5, 6, 7) self.run_model_test(NegSlice(), train=False, input=(x,), batch_size=BATCH_SIZE, 
use_gpu=False) + @unittest.skip('https://github.com/pytorch/pytorch/issues/10984') @skipIfUnsupportedOpsetVersion([10]) def test_neg_slice_large_negone(self): class NegSlice(torch.nn.Module): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 11af41143c314..3c741fb8845d9 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -345,11 +345,12 @@ def forward(self, x): def test_slice_neg_large(self): class NegSlice(torch.nn.Module): def forward(self, x): - return x[:, :, -3:-1, :, -1] + return x[:, :, :, :, -3] x = torch.randn(3, 4, 5, 6, 7) self.run_test(NegSlice(), x) + @unittest.skip('https://github.com/pytorch/pytorch/issues/10984') def test_slice_neg_large_negone(self): class NegSlice(torch.nn.Module): def forward(self, x): diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py index a174788005ec8..19dd546ecc832 100644 --- a/torch/onnx/symbolic_opset9.py +++ b/torch/onnx/symbolic_opset9.py @@ -394,13 +394,13 @@ def split_with_sizes(g, self, split_sizes, dim): @parse_args('v', 'i', 'v') def select(g, self, dim, index): - index = sym_help._maybe_get_scalar(index) - if (not sym_help._is_value(index)) and (index < 0): - if index == -1: - end_index = 9223372036854775807 - else: - end_index = index + 1 - slice_node = sym_help._slice_helper(g, self, axes=[dim], starts=[index], ends=[end_index]) + if dim > 1: + # TODO: this is a temporary hack because of the implementation details + # of Gather in caffe2. We need to change this as soon as possible. + # TODO: this breaks if index == -1 + index_val = _parse_arg(index, 'i') + slice_node = sym_help._slice_helper(g, self, axes=[dim], + starts=[index_val], ends=[index_val + 1]) return g.op("Squeeze", slice_node, axes_i=[dim]) else: return g.op("Gather", self, index, axis_i=dim) From 2c51e0659b79dce39b27efb9d5e6755524eeb21c Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 8 Oct 2019 08:57:16 -0700 Subject: [PATCH 108/113] Roll master to 1.4.0 (#27374) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27374 Signed-off-by: Edward Z. Yang Test Plan: Imported from OSS Differential Revision: D17809770 Pulled By: ezyang fbshipit-source-id: 75bd97426494a7bbbf08f9bce7563d35871443d8 --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index c4150fa5bb510..895d424d5d4b2 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.3.0a0 +1.4.0a0 From e63addfff6570db98c2918e10a6010340cb29d39 Mon Sep 17 00:00:00 2001 From: Swati Rallapalli Date: Tue, 8 Oct 2019 08:58:48 -0700 Subject: [PATCH 109/113] Exponential decay of the weight of task loss (#27508) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27508 Implemented a simple exponential decay of the weight of lr loss function, with a lower bound. 
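In effect, after t passes the task loss is scaled by
w_t = max(task_gamma ** t, task_gamma_lb). A minimal sketch of that schedule
(the gamma values here are illustrative, not defaults from this diff):

    def task_loss_weight(step, task_gamma=0.9999, task_gamma_lb=0.1):
        # exponential decay of the task loss weight, floored at task_gamma_lb
        return max(task_gamma ** step, task_gamma_lb)

    # e.g. task_loss_weight(1) == task_gamma, and the weight bottoms out
    # at task_gamma_lb as training proceeds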
Test Plan: buck test //caffe2/caffe2/fb/dper/layer_models/tests:mtml_test -- test_task_weight_decay https://our.intern.facebook.com/intern/testinfra/testrun/3377699729136308 canary: f140103452 Reviewed By: chenshouyuan Differential Revision: D17524101 fbshipit-source-id: 9a653e21a4ecb74dfc4ac949c9e3388f36ef3a20 --- caffe2/python/layers/batch_lr_loss.py | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py index f0d5a81f484fb..4ab03d80f928d 100644 --- a/caffe2/python/layers/batch_lr_loss.py +++ b/caffe2/python/layers/batch_lr_loss.py @@ -31,6 +31,8 @@ def __init__( uncertainty_penalty=1.0, focal_gamma=0.0, stop_grad_in_focal_factor=False, + task_gamma=1.0, + task_gamma_lb=0.1, **kwargs ): super(BatchLRLoss, self).__init__(model, name, input_record, **kwargs) @@ -75,6 +77,44 @@ def __init__( self.focal_gamma = focal_gamma self.stop_grad_in_focal_factor = stop_grad_in_focal_factor + self.apply_exp_decay = False + if task_gamma < 1.0: + self.apply_exp_decay = True + self.task_gamma_cur = self.create_param( + param_name=('%s_task_gamma_cur' % self.name), + shape=[1], + initializer=( + 'ConstantFill', { + 'value': 1.0, + 'dtype': core.DataType.FLOAT + } + ), + optimizer=self.model.NoOptim, + ) + + self.task_gamma = self.create_param( + param_name=('%s_task_gamma' % self.name), + shape=[1], + initializer=( + 'ConstantFill', { + 'value': task_gamma, + 'dtype': core.DataType.FLOAT + } + ), + optimizer=self.model.NoOptim, + ) + + self.task_gamma_lb = self.create_param( + param_name=('%s_task_gamma_lb' % self.name), + shape=[1], + initializer=( + 'ConstantFill', { + 'value': task_gamma_lb, + 'dtype': core.DataType.FLOAT + } + ), + optimizer=self.model.NoOptim, + ) def init_weight(self, jsd_weight, homotopy_weighting): if homotopy_weighting: @@ -197,6 +237,21 @@ def add_ops(self, net): [xent, focal_factor], net.NextScopedBlob("focallossxent") ) + if self.apply_exp_decay: + net.Mul( + [self.task_gamma_cur, self.task_gamma], + self.task_gamma_cur + ) + + task_gamma_multiplier = net.Max( + [self.task_gamma_cur, self.task_gamma_lb], + net.NextScopedBlob("task_gamma_cur_multiplier") + ) + + xent = net.Mul( + [xent, task_gamma_multiplier], net.NextScopedBlob("expdecayxent") + ) + # fuse with JSD if self.jsd_fuse: jsd = net.BernoulliJSD( From a96b003b3965628013f8fbfa0e713bc8c2bde0cc Mon Sep 17 00:00:00 2001 From: Chris Gottbrath Date: Tue, 8 Oct 2019 08:59:37 -0700 Subject: [PATCH 110/113] docstring only formatting changes: quantize.py, fake_quantize.py, observer.py Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27415 Reviewed By: zafartahirov Differential Revision: D17783101 Pulled By: gottbrath fbshipit-source-id: a7acbc55edfaa75fdbd17fd30d530710a401b22f --- torch/quantization/fake_quantize.py | 17 +++++------------ torch/quantization/observer.py | 2 +- torch/quantization/quantize.py | 13 ++++++------- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/torch/quantization/fake_quantize.py b/torch/quantization/fake_quantize.py index 7f3924e715ee4..5c1bb274964b8 100644 --- a/torch/quantization/fake_quantize.py +++ b/torch/quantization/fake_quantize.py @@ -4,7 +4,7 @@ from .observer import MovingAverageMinMaxObserver, HistogramObserver, MovingAveragePerChannelMinMaxObserver, _with_args class FakeQuantize(Module): - ''' Simulate the quantize and dequantize operations in training time. + r""" Simulate the quantize and dequantize operations in training time. 
The output of this module is given by x_out = (clamp(round(x/scale + zero_point), quant_min, quant_max)-zero_point)*scale @@ -25,13 +25,13 @@ class FakeQuantize(Module): * :attr:`observer_enable` controls statistics collection on tensors * :attr:`dtype` specifies the quantized dtype that is being emulated with fake-quantization, - allowable values are torch.qint8 and torch.quint8. The values of quant_min and quant_max should - be chosen to be consistent with the dtype + allowable values are torch.qint8 and torch.quint8. The values of quant_min and + quant_max should be chosen to be consistent with the dtype Args: - observer (module): Module for observing statistics on input tensors and calculating - scale and zero-point. + observer (module): Module for observing statistics on input tensors and calculating scale + and zero-point. quant_min (int): The minimum allowable quantized value. quant_max (int): The maximum allowable quantized value. observer_kwargs (optional): Arguments for the observer module @@ -41,13 +41,6 @@ class FakeQuantize(Module): provides a method to calculate scale and zero-point. """ - Args: - `observer`: Observer module that records stats of input tensor - `quant_min`: Tensors are fake-quantized corresponding to the - `quant_max`: A function that calculates quantization parameters - given the stats - `observer_kwargs` - ''' def __init__(self, observer=MovingAverageMinMaxObserver, quant_min=0, quant_max=255, **observer_kwargs): super(FakeQuantize, self).__init__() assert quant_min <= quant_max, \ diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py index 46800dd997438..6aebb046d7f75 100644 --- a/torch/quantization/observer.py +++ b/torch/quantization/observer.py @@ -174,7 +174,7 @@ class MinMaxObserver(_ObserverBase): r"""Default Observer Module A default implementation of the observer module, only works for `per_tensor_affine` quantization scheme. The module will record the - running average of max and min value of the observed Tensor and + running average of max and min value of the observed Tensor and calculate_qparams will calculate scale and zero_point """ diff --git a/torch/quantization/quantize.py b/torch/quantization/quantize.py index 75457ca3830fb..ebcdedb19e8f5 100644 --- a/torch/quantization/quantize.py +++ b/torch/quantization/quantize.py @@ -80,12 +80,10 @@ def add_observer_(module): has a valid qconfig attribute. Args: - module: input module with qconfig attributes for all the leaf modules - that we want to quantize + module: input module with qconfig attributes for all the leaf modules that we want to quantize Return: - None, module is modified inplace with added observer modules and - forward_hooks + None, module is modified inplace with added observer modules and forward_hooks """ for child in module.children(): if type(child) == nnq.FloatFunctional: @@ -252,7 +250,8 @@ def quantize_qat(model, run_fn, run_args, inplace=False): Args: model: input model run_fn: a function for evaluating the prepared model, can be a - function that simply runs the prepared model or a training loop + function that simply runs the prepared model or a training + loop run_args: positional arguments for `run_fn` Return: @@ -271,8 +270,8 @@ def convert(module, mapping=None, inplace=False): parameters) to a quantized module. 
Args: module: calibrated module with observers - mapping: a dictionary that maps from float module type to quantized - module type, can be overwrritten to allow swapping user defined Modules + mapping: a dictionary that maps from float module type to quantized module type, can + be overwrritten to allow swapping user defined Modules inplace: carry out model transformations in-place, the original module is mutated """ if mapping is None: From 6e0312a9c5df4ee2e1daabd6dcf558cbf09357d0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 8 Oct 2019 09:15:24 -0700 Subject: [PATCH 111/113] Revert "Make static dispatch turn off variable before entering the kernel. (#26908)" (#27283) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/27283 This reverts commit 9159a601ca4953ecf0d3dc568cd0b966de2d4686. Test Plan: Imported from OSS Differential Revision: D17738167 Pulled By: ezyang fbshipit-source-id: cc4048d553017409279603590833d1529f59048c --- aten/src/ATen/function_wrapper.py | 7 ------- aten/src/ATen/templates/TensorMethods.h | 1 - 2 files changed, 8 deletions(-) diff --git a/aten/src/ATen/function_wrapper.py b/aten/src/ATen/function_wrapper.py index b87d4292b5d41..e3baa023ab837 100644 --- a/aten/src/ATen/function_wrapper.py +++ b/aten/src/ATen/function_wrapper.py @@ -239,17 +239,10 @@ def TypedDict(name, attrs, total=True): # type: ignore # In order to rely on the linker to strip unused ops, it requires us to dispatch statically # in Functions.h and TensorMethods.h. -# -# NB: The default body also needs to apply a variable guard, as in some -# situations what we think is a default body actually does have an -# explicit derivative, and thereby would have gotten unwrapped by -# the time you get to the implementation. STATIC_DISPATCH_FUNCTION_DEFAULT_BODY = CodeTemplate("""\ -at::AutoNonVariableTypeMode _var_guard(true); ${return_call} TypeDefault::${native_type_method_dispatch}(${native_arguments}); """) STATIC_DISPATCH_FUNCTION_SWITCH_BODY = CodeTemplate("""\ -at::AutoNonVariableTypeMode _var_guard(true); switch(tensorTypeIdToBackend(impl::dispatchTypeId(${type_set}))) { ${static_dispatch_function_switches} default: diff --git a/aten/src/ATen/templates/TensorMethods.h b/aten/src/ATen/templates/TensorMethods.h index 67640c08d43dd..da3f8fe1a5dbd 100644 --- a/aten/src/ATen/templates/TensorMethods.h +++ b/aten/src/ATen/templates/TensorMethods.h @@ -11,7 +11,6 @@ #include #include #include -#include #ifdef USE_STATIC_DISPATCH #include From 16ece1c9daefe58ba56db63d9c930a21411998d9 Mon Sep 17 00:00:00 2001 From: Abbas <42001049+iam-abbas@users.noreply.github.com> Date: Tue, 8 Oct 2019 09:28:07 -0700 Subject: [PATCH 112/113] Fixed typos and grammatical errors (#27465) Summary: Fixes https://github.com/pytorch/pytorch/issues/27443 Pull Request resolved: https://github.com/pytorch/pytorch/pull/27465 Differential Revision: D17810732 Pulled By: pietern fbshipit-source-id: b8a62dd086a4f4a61c9aa6acfa495cf822995604 --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 60366994dd880..8f8aa4ecf5962 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ You can write new neural network layers in Python using the torch API [or your favorite NumPy-based libraries such as SciPy](https://pytorch.org/tutorials/advanced/numpy_extensions_tutorial.html). If you want to write your layers in C/C++, we provide a convenient extension API that is efficient and with minimal boilerplate. -There is no wrapper code that needs to be written. 
You can see [a tutorial here](https://pytorch.org/tutorials/advanced/cpp_extension.html) and [an example here](https://github.com/pytorch/extension-cpp).
+No wrapper code needs to be written. You can see [a tutorial here](https://pytorch.org/tutorials/advanced/cpp_extension.html) and [an example here](https://github.com/pytorch/extension-cpp).

 ## Installation
@@ -145,7 +145,7 @@ Python wheels for NVIDIA's Jetson Nano, Jetson TX2, and Jetson AGX Xavier are av
 - Python 2.7: https://nvidia.box.com/v/torch-weekly-cp27-jetson-jp42
 - Python 3.6: https://nvidia.box.com/v/torch-weekly-cp36-jetson-jp42

-They requires JetPack 4.2 and above and are maintained by @dusty-nv
+They require JetPack 4.2 and above, and @dusty-nv maintains them

 ### From Source
@@ -234,7 +234,7 @@ set FORCE_PY27_BUILD=1
 :: Note: This value is useless if Ninja is detected. However, you can force that by using `set USE_NINJA=OFF`.
 set CMAKE_GENERATOR=Visual Studio 15 2017

-:: Read the content in the previous section carefully before you preceed.
+:: Read the content in the previous section carefully before you proceed.
 :: [Optional] If you want to override the underlying toolset used by Ninja and Visual Studio with CUDA, please run the following script block.
 :: "Visual Studio 2017 Developer Command Prompt" will be run automatically.
 :: Make sure you have CMake >= 3.12 before you do this when you use the Visual Studio generator.

From 7f183a978f31a87a7350c526386cec92bb6b2740 Mon Sep 17 00:00:00 2001
From: Mike Ruberry
Date: Tue, 8 Oct 2019 09:50:28 -0700
Subject: [PATCH 113/113] Stops common_utils.py from setting the default
 tensor type (to torch.DoubleTensor) (#27444)

Summary:
This PR stops common_utils.py from setting the default tensor type when it's
imported. See issue https://github.com/pytorch/pytorch/issues/27355. This is
a frequent source of confusion for test writers.

Many tests relied on this setting (whether they knew it or not), and this PR
also updates the test suite to pass without common_utils.py setting the
default tensor type. Some larger test files now set the default floating
dtype themselves, however. These test files are:

- test_autograd.py
- test_distributions.py
- test_jit.py
- test_nn.py

This is still a significant improvement, however. First, these files set the
default floating dtype much more clearly than relying on the common_utils
import to set it. Second, the rest of the test suite no longer sets this
globally. Third, this PR is a springboard to updating those tests, too. In
particular, as tests are made generic they can be moved away from relying on
this global setting.

Notable technical changes in this PR are:

- Significant updates to test_torch.py to make it pass without setting the
  default floating dtype globally.
- The default_floating_dtype decorator is now defined in common_utils;
  previously, a couple of versions of this decorator were defined in test
  files.
- test_torch-specific parts of common_utils were refactored into test_torch.
- tensor creation methods in common_utils were updated to accept an optional
  dtype and device.
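As a usage sketch of the centralized decorator (its definition appears in the
common_utils.py diff below):

    import torch
    from common_utils import default_floating_dtype

    @default_floating_dtype(torch.double)
    def test_wants_float64(self):
        # factories such as torch.randn default to float64 inside this test
        # only; the wrapper restores the previous default dtype on exit,
        # even if the test raises
        assert torch.randn(3).dtype == torch.float64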
Pull Request resolved: https://github.com/pytorch/pytorch/pull/27444 Differential Revision: D17795235 Pulled By: mruberry fbshipit-source-id: 7f77271c0c836e69f183ad9057a2c4b29f09d2e1 --- test/common_device_type.py | 5 +- test/common_utils.py | 84 ++-- test/test_autograd.py | 7 +- test/test_c10d.py | 4 +- test/test_cpp_extensions.py | 4 + test/test_cuda.py | 11 +- test/test_dataloader.py | 3 +- test/test_dist_autograd_fork.py | 5 + test/test_distributions.py | 5 + test/test_jit.py | 29 +- test/test_namedtensor.py | 3 +- test/test_nn.py | 29 +- test/test_optim.py | 13 +- test/test_rpc_fork.py | 5 + test/test_sparse.py | 13 +- test/test_torch.py | 672 +++++++++++++++++--------------- 16 files changed, 477 insertions(+), 415 deletions(-) diff --git a/test/common_device_type.py b/test/common_device_type.py index b3352182c643d..24a70410918b1 100644 --- a/test/common_device_type.py +++ b/test/common_device_type.py @@ -196,11 +196,10 @@ def get_all_devices(cls): primary_device_idx = int(cls.get_primary_device().split(':')[1]) num_devices = torch.cuda.device_count() - devices = [cls.get_primary_device()] + prim_device = cls.get_primary_device() cuda_str = 'cuda:{0}' non_primary_devices = [cuda_str.format(idx) for idx in range(num_devices) if idx != primary_device_idx] - devices.extend(non_primary_devices) - return devices + return [prim_device] + non_primary_devices @classmethod def setUpClass(cls): diff --git a/test/common_utils.py b/test/common_utils.py index 9a4e0c00561eb..5be92588cbf91 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -42,7 +42,6 @@ import torch.backends.mkl -torch.set_default_tensor_type('torch.DoubleTensor') torch.backends.disable_global_flags() @@ -298,6 +297,20 @@ def wrapper(*args, **kwargs): fn(*args, **kwargs) return wrapper +def default_floating_dtype(dtype): + def decorator(fn): + @wraps(fn) + def wrapper(*args, **kwargs): + old_type = torch.tensor(()).dtype + torch.set_default_dtype(dtype) + try: + return fn(*args, **kwargs) + finally: + torch.set_default_dtype(old_type) + + return wrapper + + return decorator def get_cpu_type(type_name): module, name = type_name.rsplit('.', 1) @@ -970,7 +983,6 @@ def runWithPytorchAPIUsageStderr(code): assertNotRegex = unittest.TestCase.assertNotRegexpMatches - def download_file(url, binary=True): if sys.version_info < (3,): from urlparse import urlsplit @@ -1034,9 +1046,9 @@ def prod_single_zero(dim_size): return result -def random_square_matrix_of_rank(l, rank): +def random_square_matrix_of_rank(l, rank, dtype=torch.double, device='cpu'): assert rank <= l - A = torch.randn(l, l) + A = torch.randn(l, l, dtype=dtype, device=device) u, s, v = A.svd() for i in range(l): if i >= rank: @@ -1046,20 +1058,28 @@ def random_square_matrix_of_rank(l, rank): return u.mm(torch.diag(s)).mm(v.transpose(0, 1)) -def random_symmetric_matrix(l, *batches): - A = torch.randn(*(batches + (l, l))) +def random_symmetric_matrix(l, *batches, **kwargs): + dtype = kwargs.get('dtype', torch.double) + device = kwargs.get('device', 'cpu') + A = torch.randn(*(batches + (l, l)), dtype=dtype, device=device) A = (A + A.transpose(-2, -1)).div_(2) return A -def random_symmetric_psd_matrix(l, *batches): - A = torch.randn(*(batches + (l, l))) +def random_symmetric_psd_matrix(l, *batches, **kwargs): + dtype = kwargs.get('dtype', torch.double) + device = kwargs.get('device', 'cpu') + A = torch.randn(*(batches + (l, l)), dtype=dtype, device=device) return torch.matmul(A, A.transpose(-2, -1)) -def random_symmetric_pd_matrix(matrix_size, *batch_dims): - A = 
torch.randn(*(batch_dims + (matrix_size, matrix_size))) - return torch.matmul(A, A.transpose(-2, -1)) + torch.eye(matrix_size) * 1e-5 +def random_symmetric_pd_matrix(matrix_size, *batch_dims, **kwargs): + dtype = kwargs.get('dtype', torch.double) + device = kwargs.get('device', 'cpu') + A = torch.randn(*(batch_dims + (matrix_size, matrix_size)), + dtype=dtype, device=device) + return torch.matmul(A, A.transpose(-2, -1)) \ + + torch.eye(matrix_size, dtype=dtype, device=device) * 1e-5 def make_nonzero_det(A, sign=None, min_singular_value=0.1): @@ -1080,48 +1100,20 @@ def make_nonzero_det(A, sign=None, min_singular_value=0.1): return A -def random_fullrank_matrix_distinct_singular_value(matrix_size, *batch_dims, **kwargs): +def random_fullrank_matrix_distinct_singular_value(matrix_size, *batch_dims, + **kwargs): + dtype = kwargs.get('dtype', torch.double) + device = kwargs.get('device', 'cpu') silent = kwargs.get("silent", False) if silent and not torch._C.has_lapack: - return torch.ones(matrix_size, matrix_size) + return torch.ones(matrix_size, matrix_size, dtype=dtype, device=device) - A = torch.randn(batch_dims + (matrix_size, matrix_size)) + A = torch.randn(batch_dims + (matrix_size, matrix_size), dtype=dtype, device=device) u, _, v = A.svd() - s = torch.arange(1., matrix_size + 1).mul_(1.0 / (matrix_size + 1)).diag() + s = torch.arange(1., matrix_size + 1, dtype=dtype, device=device).mul_(1.0 / (matrix_size + 1)).diag() return u.matmul(s.expand(batch_dims + (matrix_size, matrix_size)).matmul(v.transpose(-2, -1))) -def lu_solve_test_helper(self, A_dims, b_dims, cast, pivot): - b = cast(torch.randn(*b_dims)) - A = cast(random_fullrank_matrix_distinct_singular_value(*A_dims)) - LU_data, LU_pivots, info = torch.lu(A, get_infos=True, pivot=pivot) - self.assertEqual(info, torch.zeros_like(info)) - return b, A, LU_data, LU_pivots - - -def cholesky_solve_test_helper(A_dims, b_dims, cast, upper): - b = cast(torch.randn(*b_dims)) - A = cast(random_symmetric_pd_matrix(*A_dims)) - L = torch.cholesky(A, upper=upper) - return b, A, L - - -def triangular_solve_test_helper(A_dims, b_dims, cast, upper, unitriangular): - triangle_function = torch.triu if upper else torch.tril - b = cast(torch.randn(*b_dims)) - A = cast(torch.randn(*A_dims)) - A_triangular = triangle_function(A) - if unitriangular: - A_triangular.diagonal(dim1=-2, dim2=-1).fill_(1.) 
- return b, A_triangular - - -def solve_test_helper(A_dims, b_dims, cast): - b = cast(torch.randn(*b_dims)) - A = cast(random_fullrank_matrix_distinct_singular_value(*A_dims)) - return b, A - - def brute_pdist(inp, p=2): """Computes the same as torch.pdist using primitives""" n = inp.shape[-2] diff --git a/test/test_autograd.py b/test/test_autograd.py index 6e17cc737ad5d..1854b60144524 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -4,7 +4,6 @@ import math import tempfile import time -import torch import unittest import warnings from copy import deepcopy @@ -12,6 +11,12 @@ from itertools import product from operator import mul from functools import reduce +import torch + +# TODO: remove this global setting +# Autograd tests use double as the default dtype +torch.set_default_dtype(torch.double) + from torch import nn from torch._six import inf, nan, istuple from torch.autograd.gradcheck import gradgradcheck, gradcheck diff --git a/test/test_c10d.py b/test/test_c10d.py index e5d70ba53df22..509c2478fc5f9 100644 --- a/test/test_c10d.py +++ b/test/test_c10d.py @@ -28,7 +28,7 @@ from common_distributed import MultiProcessTestCase, \ requires_gloo, requires_nccl, requires_nccl_version, \ skip_if_not_multigpu, skip_if_lt_x_gpu, skip_for_known_issues, get_timeout -from common_utils import TestCase, load_tests, run_tests +from common_utils import TestCase, load_tests, run_tests, default_floating_dtype from common_utils import retry_on_address_already_in_use_error # load_tests from common_utils is used to automatically filter tests for @@ -2070,6 +2070,7 @@ def test_dist_broadcast_coalesced_gloo(self): @requires_gloo() @skip_if_not_multigpu + @default_floating_dtype(torch.double) def test_sync_params_no_buffers(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() @@ -2097,6 +2098,7 @@ def test_sync_params_no_buffers(self): @requires_gloo() @skip_if_not_multigpu + @default_floating_dtype(torch.double) def test_sync_params_with_buffers(self): store = c10d.FileStore(self.file_name, self.world_size) options = c10d.ProcessGroupGloo.Options() diff --git a/test/test_cpp_extensions.py b/test/test_cpp_extensions.py index 289b6ff6ba82d..ee4a557e10d7a 100644 --- a/test/test_cpp_extensions.py +++ b/test/test_cpp_extensions.py @@ -9,6 +9,7 @@ import glob import common_utils as common +from common_utils import default_floating_dtype import torch import torch.backends.cudnn import torch.utils.cpp_extension @@ -65,6 +66,7 @@ def test_extension_function(self): z = cpp_extension.sigmoid_add(x, y) self.assertEqual(z, x.sigmoid() + y.sigmoid()) + @default_floating_dtype(torch.double) def test_extension_module(self): mm = cpp_extension.MatrixMultiplier(4, 8) weights = torch.rand(8, 4) @@ -72,6 +74,7 @@ def test_extension_module(self): result = mm.forward(weights) self.assertEqual(expected, result) + @default_floating_dtype(torch.double) def test_backward(self): mm = cpp_extension.MatrixMultiplier(4, 8) weights = torch.rand(8, 4, requires_grad=True) @@ -474,6 +477,7 @@ def compile(code): @dont_wipe_extensions_build_folder @common.skipIfRocm + @default_floating_dtype(torch.double) def test_cpp_frontend_module_has_same_output_as_python(self): extension = torch.utils.cpp_extension.load( name="cpp_frontend_extension", diff --git a/test/test_cuda.py b/test/test_cuda.py index 3ce95bfaf1e58..0b0d8b862fe1b 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -23,7 +23,7 @@ _compare_trilu_indices, _compare_large_trilu_indices from common_utils import 
TestCase, get_gpu_type, freeze_rng_state, run_tests, \ PY3, IS_WINDOWS, NO_MULTIPROCESSING_SPAWN, skipIfRocm, \ - load_tests, slowTest, skipCUDANonDefaultStreamIf + load_tests, slowTest, skipCUDANonDefaultStreamIf, default_floating_dtype # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -433,9 +433,9 @@ def test_serialization_array_with_storage(self): self.assertEqual(q_copy, q, 0) q_copy[0].fill_(5) self.assertEqual(q_copy[0], q_copy[2], 0) - self.assertTrue(isinstance(q_copy[0], torch.cuda.DoubleTensor)) + self.assertTrue(isinstance(q_copy[0], torch.cuda.FloatTensor)) self.assertTrue(isinstance(q_copy[1], torch.cuda.IntTensor)) - self.assertTrue(isinstance(q_copy[2], torch.cuda.DoubleTensor)) + self.assertTrue(isinstance(q_copy[2], torch.cuda.FloatTensor)) self.assertTrue(isinstance(q_copy[3], torch.cuda.IntStorage)) q_copy[1].fill_(10) self.assertTrue(q_copy[3], torch.cuda.IntStorage(10).fill_(10)) @@ -443,14 +443,14 @@ def test_serialization_array_with_storage(self): def test_type_conversions(self): x = torch.randn(5, 5) self.assertIsInstance(x.float(), torch.FloatTensor) - self.assertIsInstance(x.cuda(), torch.cuda.DoubleTensor) + self.assertIsInstance(x.cuda().double(), torch.cuda.DoubleTensor) self.assertIsInstance(x.cuda().float(), torch.cuda.FloatTensor) self.assertIsInstance(x.cuda().float().cpu(), torch.FloatTensor) self.assertIsInstance(x.cuda().float().cpu().int(), torch.IntTensor) y = x.storage() self.assertIsInstance(y.float(), torch.FloatStorage) - self.assertIsInstance(y.cuda(), torch.cuda.DoubleStorage) + self.assertIsInstance(y.cuda().double(), torch.cuda.DoubleStorage) self.assertIsInstance(y.cuda().float(), torch.cuda.FloatStorage) self.assertIsInstance(y.cuda().float().cpu(), torch.FloatStorage) self.assertIsInstance(y.cuda().float().cpu().int(), torch.IntStorage) @@ -1641,6 +1641,7 @@ def test_nvtx(self): torch.cuda.nvtx.mark("bar") torch.cuda.nvtx.range_pop() + @default_floating_dtype(torch.double) def test_bincount_ext(self): # ensure CUDA code coverage input_size = (5000,) diff --git a/test/test_dataloader.py b/test/test_dataloader.py index d450e6005dc85..ca8c9021bb902 100644 --- a/test/test_dataloader.py +++ b/test/test_dataloader.py @@ -17,7 +17,7 @@ from torch._utils import ExceptionWrapper from common_utils import (TestCase, run_tests, TEST_NUMPY, IS_WINDOWS, PY3, IS_PYTORCH_CI, NO_MULTIPROCESSING_SPAWN, skipIfRocm, - load_tests, TEST_WITH_TSAN) + load_tests, TEST_WITH_TSAN, default_floating_dtype) try: import psutil @@ -1485,6 +1485,7 @@ def check_len(dl, expected): check_len(DataLoader(self.dataset, batch_size=3), 34) @unittest.skipIf(not TEST_NUMPY, "numpy unavailable") + @default_floating_dtype(torch.double) def test_numpy_scalars(self): import numpy as np diff --git a/test/test_dist_autograd_fork.py b/test/test_dist_autograd_fork.py index f64c13ec2e96e..e57b4c3803b57 100644 --- a/test/test_dist_autograd_fork.py +++ b/test/test_dist_autograd_fork.py @@ -1,6 +1,11 @@ #!/usr/bin/env python3 from __future__ import absolute_import, division, print_function, unicode_literals +import torch + +# dist_autograd_fork tests use double as the default dtype +torch.set_default_dtype(torch.double) + from dist_autograd_test import DistAutogradTest from common_distributed import MultiProcessTestCase from common_utils import run_tests diff --git a/test/test_distributions.py b/test/test_distributions.py index 09073e970090b..8b3265a2b7f2e 100644 --- a/test/test_distributions.py +++ 
b/test/test_distributions.py @@ -30,6 +30,11 @@ from random import shuffle import torch + +# TODO: remove this global setting +# Distributions tests use double as the default dtype +torch.set_default_dtype(torch.double) + from torch._six import inf from common_utils import TestCase, run_tests, set_rng_seed, TEST_WITH_UBSAN, load_tests from common_cuda import TEST_CUDA diff --git a/test/test_jit.py b/test/test_jit.py index a1a09fbb24322..a21e17ced3716 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -1,5 +1,10 @@ # -*- coding: UTF-8 -*- from __future__ import division +import torch + +# TODO: remove this global setting +# JIT tests use double as the default dtype +torch.set_default_dtype(torch.double) # Torch from torch import Tensor @@ -10,7 +15,6 @@ from torch.jit.frontend import NotSupportedError from torch.onnx import OperatorExportTypes from torch.testing import FileCheck -import torch import torch.cuda import torch.jit import torch.jit._logging @@ -26,7 +30,8 @@ # Testing utils from common_utils import run_tests, IS_WINDOWS, TEST_WITH_UBSAN, \ skipIfRocm, skipIfNoLapack, suppress_warnings, load_tests, IS_SANDCASTLE, \ - freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy + freeze_rng_state, set_rng_seed, slowTest, TemporaryFileName, skipIfCompiledWithoutNumpy, \ + default_floating_dtype from jit_utils import JitTestCase, enable_cpu_fuser, disable_autodiff_subgraph_inlining, \ _trace, enable_cpu_fuser_if, enable_profiling_mode, do_input_map, \ execWrapper, _inline_everything, _tmp_donotuse_dont_inline_everything, \ @@ -89,24 +94,6 @@ PY35 = sys.version_info >= (3, 5) -def default_tensor_type(type): - type_str = torch.typename(type) - - def decorator(fn): - @wraps(fn) - def wrapper(*args, **kwargs): - old_type = torch.Tensor().type() - torch.set_default_tensor_type(type_str) - try: - return fn(*args, **kwargs) - finally: - torch.set_default_tensor_type(old_type) - - return wrapper - - return decorator - - def LSTMCellF(input, hx, cx, *params): return LSTMCell(input, (hx, cx), *params) @@ -7888,7 +7875,7 @@ def test_binary_op_shape(self): self._test_binary_op_shape(['mul', 'div', 'add', 'sub'], 0) self._test_binary_op_shape(['mul', 'div', 'add', 'sub'], 3) - @default_tensor_type(torch.FloatTensor) + @default_floating_dtype(torch.float) def test_wrapped_number(self): # Scalar's get converted to 'wrapped' tensors of default tensor type. 
# Wrapped tensors behave differently in certain promotion operations: diff --git a/test/test_namedtensor.py b/test/test_namedtensor.py index 4377bdbd8f92f..015ab594fa08a 100644 --- a/test/test_namedtensor.py +++ b/test/test_namedtensor.py @@ -1,5 +1,5 @@ import unittest -from common_utils import TestCase, run_tests, TEST_NUMPY +from common_utils import TestCase, run_tests, TEST_NUMPY, default_floating_dtype from common_cuda import TEST_CUDA from collections import namedtuple, OrderedDict import itertools @@ -275,6 +275,7 @@ def test_no_multiprocessing_support(self): with self.assertRaisesRegex(RuntimeError, "NYI"): ForkingPickler(buf, pickle.HIGHEST_PROTOCOL).dump(named_tensor) + @default_floating_dtype(torch.double) def test_big_tensor_repr(self): def check_repr(named_tensor): unnamed_tensor = named_tensor.rename(None) diff --git a/test/test_nn.py b/test/test_nn.py index 0fe2c005eb0f3..96f7a6ee509d9 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -16,6 +16,11 @@ import threading import torch + +# TODO: remove this global setting +# NN tests use double as the default dtype +torch.set_default_dtype(torch.double) + from torch._six import inf, nan import torch.backends.cudnn as cudnn import torch.nn as nn @@ -31,7 +36,7 @@ from torch.nn.parallel._functions import Broadcast from common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \ TEST_NUMPY, TEST_SCIPY, download_file, PY3, PY34, to_gpu, \ - get_function_arglist, load_tests + get_function_arglist, load_tests, default_floating_dtype from common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, TEST_CUDNN_VERSION from common_nn import NNTestCase, ModuleTest, CriterionTest, TestBase, \ module_tests, criterion_tests, new_criterion_tests, loss_reference_fns, \ @@ -203,24 +208,6 @@ def test_to(self): self.assertEqual(b.long(), b.to(dtype=torch.int64)) -def default_tensor_type(type): - type_str = torch.typename(type) - - def decorator(fn): - @wraps(fn) - def wrapper(*args, **kwargs): - old_type = torch.Tensor().type() - torch.set_default_tensor_type(type_str) - try: - return fn(*args, **kwargs) - finally: - torch.set_default_tensor_type(old_type) - - return wrapper - - return decorator - - def _assertGradAndGradgradChecks(test_case, apply_fn, inputs): # call assert function rather than returning a bool since it's nicer # if we get whether this failed on the gradcheck or the gradgradcheck. 
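For reference, the per-file default_tensor_type decorators deleted from test_jit.py and test_nn.py above are superseded by the dtype-based default_floating_dtype decorator this patch adds to common_utils.py. A minimal self-contained sketch of that decorator and one illustrative use (the make_weights function below is hypothetical, not part of the patch):

    import torch
    from functools import wraps

    def default_floating_dtype(dtype):
        def decorator(fn):
            @wraps(fn)
            def wrapper(*args, **kwargs):
                old_dtype = torch.tensor(()).dtype  # current default floating dtype
                torch.set_default_dtype(dtype)
                try:
                    return fn(*args, **kwargs)
                finally:
                    # Restore the previous default even if the test fails.
                    torch.set_default_dtype(old_dtype)
            return wrapper
        return decorator

    @default_floating_dtype(torch.double)
    def make_weights():
        # Inside the decorated function, factories like torch.rand
        # produce float64 tensors; the old default is restored on exit.
        return torch.rand(8, 4)

    assert make_weights().dtype == torch.float64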
@@ -5496,7 +5483,7 @@ def compare_cpu_gpu(outputs_cpu, outputs_gpu): compare_cpu_gpu(outputs_cpu, outputs_gpu) @unittest.skipIf(not TEST_CUDNN, "needs cudnn") - @default_tensor_type(torch.FloatTensor) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented + @default_floating_dtype(torch.float) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented def test_RNN_cpu_vs_cudnn_no_dropout(self): self._test_RNN_cpu_vs_cudnn(0) @@ -5522,7 +5509,7 @@ def test_RNN_cudnn_weight_norm(self): self.assertEqual(m(input), expected_output) @unittest.skipIf(not (TEST_CUDNN and (TEST_CUDNN_VERSION if TEST_CUDNN_VERSION else 0) >= 5103), "needs cudnn >= 5.1") - @default_tensor_type(torch.FloatTensor) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented + @default_floating_dtype(torch.float) # FIXME: just until torch.cuda.DoubleTensor.sum() implemented def test_RNN_cpu_vs_cudnn_with_dropout(self): # Because of dropout randomness, can only compare dropout=0 and dropout=1 self._test_RNN_cpu_vs_cudnn(1) diff --git a/test/test_optim.py b/test/test_optim.py index 7cd0c85e22924..8f93732fe023b 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -13,7 +13,8 @@ from torch.optim.lr_scheduler import LambdaLR, StepLR, \ MultiStepLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau, \ _LRScheduler, CyclicLR, CosineAnnealingWarmRestarts, OneCycleLR -from common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests +from common_utils import TestCase, run_tests, TEST_WITH_UBSAN, load_tests, \ + skipIfRocm # load_tests from common_utils is used to automatically filter tests for # sharding on sandcastle. This line silences flake warnings @@ -27,7 +28,7 @@ def rosenbrock(tensor): def drosenbrock(tensor): x, y = tensor - return torch.DoubleTensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2))) + return torch.Tensor((-400 * x * (y - x ** 2) - 2 * (1 - x), 200 * (y - x ** 2))) class TestOptim(TestCase): @@ -61,12 +62,12 @@ def eval(params, sparse_grad, w): if w: i = torch.LongTensor([[0, 0]]) x = grad[0] - v = torch.DoubleTensor([x / 4., x - x / 4.]) + v = torch.Tensor([x / 4., x - x / 4.]) else: i = torch.LongTensor([[1, 1]]) y = grad[1] - v = torch.DoubleTensor([y - y / 4., y / 4.]) - x = sparse.DoubleTensor(i, v, torch.Size([2])) + v = torch.Tensor([y - y / 4., y / 4.]) + x = sparse.DoubleTensor(i, v, torch.Size([2])).to(dtype=v.dtype) with torch.no_grad(): if sparse_grad: params.grad = x @@ -342,6 +343,8 @@ def test_sparse_adam(self): with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"): optim.SparseAdam(None, lr=1e-2, betas=(1.0, 0.0)) + # ROCm precision is too low to pass this test + @skipIfRocm def test_adadelta(self): self._test_basic_cases( lambda weight, bias: optim.Adadelta([weight, bias]) diff --git a/test/test_rpc_fork.py b/test/test_rpc_fork.py index 5e2432f60183e..8a082b966d338 100644 --- a/test/test_rpc_fork.py +++ b/test/test_rpc_fork.py @@ -1,6 +1,11 @@ #!/usr/bin/env python3 from __future__ import absolute_import, division, print_function, unicode_literals +import torch + +# rpc_fork tests use double as the default dtype +torch.set_default_dtype(torch.double) + from rpc_test import RpcTest from common_distributed import MultiProcessTestCase from common_utils import run_tests diff --git a/test/test_sparse.py b/test/test_sparse.py index c470ed66d9bbf..7eacf9f8d1106 100644 --- a/test/test_sparse.py +++ b/test/test_sparse.py @@ -5,7 +5,8 @@ import random import sys import unittest -from common_utils import TestCase, run_tests, 
skipIfRocm, do_test_dtypes, do_test_empty_full, load_tests +from common_utils import TestCase, run_tests, skipIfRocm, do_test_dtypes, \ + do_test_empty_full, load_tests, default_floating_dtype from common_cuda import TEST_CUDA from numbers import Number from torch.autograd.gradcheck import gradcheck @@ -94,6 +95,7 @@ def randn(self, *args, **kwargs): # TODO: Put this in torch.cuda.randn return self.value_empty(*args, **kwargs).normal_() + @default_floating_dtype(torch.double) def test_print(self): shape_sparse_dim_nnz = [ ((), 0, 2), @@ -925,6 +927,7 @@ def test_shape(di, dj, dk, nnz): test_shape(1000, 0, 100, 0) test_shape(1000, 100, 0, 0) + @default_floating_dtype(torch.double) def test_sparse_addmm(self): def test_shape(m, n, p, nnz, broadcast): if broadcast: @@ -944,6 +947,7 @@ def fn(S, D1, D2): test_shape(7, 8, 9, 20, False) test_shape(7, 8, 9, 20, True) + @default_floating_dtype(torch.double) def test_sparse_mm(self): def test_shape(d1, d2, d3, nnz, transposed): if transposed: @@ -963,6 +967,7 @@ def fn(S, D): test_shape(7, 8, 9, 20, False) test_shape(7, 8, 9, 20, True) + @default_floating_dtype(torch.double) def test_dsmm(self): def test_shape(di, dj, dk, nnz): x = self._gen_sparse(2, nnz, [di, dj])[0] @@ -981,6 +986,7 @@ def test_shape(di, dj, dk, nnz): test_shape(1000, 100, 0, 20) @skipIfRocm + @default_floating_dtype(torch.double) def test_hsmm(self): def test_shape(di, dj, dk, nnz): x = self._gen_sparse(2, nnz, [di, dj])[0] @@ -1043,6 +1049,7 @@ def _test_spadd_shape(self, nnz, shape_i, shape_v=None): expected = y + r * self.safeToDense(x_) self.assertEqual(res, expected) + @default_floating_dtype(torch.double) def test_spadd(self): self._test_spadd_shape(10, [5, 6]) self._test_spadd_shape(10, [10, 10, 10]) @@ -1052,6 +1059,7 @@ def test_spadd(self): self._test_spadd_shape(0, [50, 0, 20]) self._test_spadd_shape(0, [50, 30, 0]) + @default_floating_dtype(torch.double) def test_spadd_hybrid(self): self._test_spadd_shape(10, [5, 6], [2, 3]) self._test_spadd_shape(10, [10, 10, 10], [3]) @@ -1073,6 +1081,7 @@ def test_shape(sparse_dims, nnz, with_size): test_shape(4, 0, [0, 0, 100, 5, 5, 5, 0]) @skipIfRocm + @default_floating_dtype(torch.double) def test_sparse_sum(self): def run_tests(S, td=None): @@ -1213,6 +1222,7 @@ def _test_basic_ops_shape(self, nnz_x1, nnz_x2, shape_i, shape_v=None): # coalesced. 
self.assertEqual(z._values(), y._values()) + @default_floating_dtype(torch.double) def test_basic_ops(self): self._test_basic_ops_shape(9, 12, [5, 6]) self._test_basic_ops_shape(9, 12, [10, 10, 10]) @@ -1223,6 +1233,7 @@ def test_basic_ops(self): self._test_basic_ops_shape(0, 0, [10, 10, 10]) self._test_basic_ops_shape(0, 0, [10, 10, 0]) + @default_floating_dtype(torch.double) def test_basic_ops_hybrid(self): self._test_basic_ops_shape(9, 12, [5, 6], [2, 3]) self._test_basic_ops_shape(9, 12, [10, 10, 10], [3]) diff --git a/test/test_torch.py b/test/test_torch.py index 18be60b363324..be0a16ef04af0 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -31,7 +31,8 @@ TEST_LIBROSA, run_tests, download_file, skipIfNoLapack, suppress_warnings, \ IS_WINDOWS, PY3, NO_MULTIPROCESSING_SPAWN, do_test_dtypes, do_test_empty_full, \ IS_SANDCASTLE, load_tests, brute_pdist, brute_cdist, slowTest, \ - skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf + skipCUDANonDefaultStreamIf, skipCUDAMemoryLeakCheckIf, \ + default_floating_dtype from multiprocessing.reduction import ForkingPickler from common_device_type import instantiate_device_type_tests, \ skipCPUIfNoLapack, skipCUDAIfNoMagma, skipCUDAIfRocm, onlyCUDA, onlyCPU, \ @@ -1319,6 +1320,7 @@ def test_cpow(self): @slowTest @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') + @default_floating_dtype(torch.double) def test_einsum(self): # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f x = torch.randn(5) @@ -1388,6 +1390,7 @@ def do_einsum(*args): self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) self.assertTrue(A._version == 0) # check that we do not use inplace ops + @default_floating_dtype(torch.double) def test_sum_all(self): def check_sum_all(tensor): pylist = tensor.reshape(-1).tolist() @@ -1487,6 +1490,7 @@ def test_logsumexp_dim(self): lambda n, d: logsumexp(n, d), use_integral=False) + @default_floating_dtype(torch.double) def test_sum_out(self): x = torch.rand(100, 100) res1 = torch.sum(x, 1) @@ -2924,6 +2928,7 @@ def test_ormqr(self): self.assertEqual(res2, out_holder) @skipIfNoLapack + @default_floating_dtype(torch.double) def test_eig(self): a = torch.Tensor(((1.96, 0.00, 0.00, 0.00, 0.00), (-6.49, 3.80, 0.00, 0.00, 0.00), @@ -2967,6 +2972,7 @@ def test_eig(self): self.assertEqual(X, Xhat, 1e-8, 'VeV\' wrong') @staticmethod + @default_floating_dtype(torch.double) def _test_fft_ifft_rfft_irfft(self, device='cpu'): def _test_complex(sizes, signal_ndim, prepro_fn=lambda x: x): x = prepro_fn(torch.randn(*sizes, device=device)) @@ -3746,6 +3752,7 @@ def _test_abs_single(data): for num in abs_zeros: self.assertGreater(math.copysign(1.0, num), 0.0) + @default_floating_dtype(torch.double) def test_hardshrink(self): data_original = torch.tensor([1, 0.5, 0.3, 0.6]).view(2, 2) float_types = [ @@ -3763,6 +3770,7 @@ def test_hardshrink(self): # test non-contiguous case self.assertEqual(torch.tensor([1, 0, 0.5, 0.6]).view(2, 2), data.t().hardshrink(0.3)) + @default_floating_dtype(torch.double) def test_hardshrink_edge_cases(self): def h(t, values, l_expected): for l, expected in l_expected.items(): @@ -6624,11 +6632,12 @@ def add_neg_dim_tests(): # Device-generic tests. Instantiated below and not run directly. 
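The dtype conversions that follow lean on the device-generic test framework imported above: a test written once against the TestTorchDeviceType class below receives a concrete device string (and, when decorated with @dtypes, a concrete dtype) at run time. A rough sketch of that pattern under those assumptions, with hypothetical class and test names:

    import torch
    from common_utils import TestCase, run_tests
    from common_device_type import instantiate_device_type_tests, dtypes

    class TestExampleDeviceType(TestCase):
        @dtypes(torch.double)
        def test_randn_respects_dtype(self, device, dtype):
            # dtype and device arrive as arguments instead of coming
            # from a process-wide default tensor type.
            x = torch.randn(3, 3, dtype=dtype, device=device)
            assert x.dtype == dtype

    # Stamps out one concrete class per available device backend
    # (e.g. a CPU variant and a CUDA variant) into this module's namespace.
    instantiate_device_type_tests(TestExampleDeviceType, globals())

    if __name__ == '__main__':
        run_tests()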
class TestTorchDeviceType(TestCase): - def check_internal_mem_overlap(self, inplace_op, num_inputs, device, + def check_internal_mem_overlap(self, inplace_op, num_inputs, + dtype, device, expected_failure=False): if isinstance(inplace_op, str): inplace_op = getattr(torch.Tensor, inplace_op) - input = torch.randn(1, device=device).expand(3, 3) + input = torch.randn(1, dtype=dtype, device=device).expand(3, 3) inputs = [input] + [torch.randn_like(input) for i in range(num_inputs - 1)] if not expected_failure: @@ -7174,7 +7183,8 @@ def test_inverse_many_batches(self, device): @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_pinverse(self, device): + @dtypes(torch.double) + def test_pinverse(self, device, dtype): from common_utils import random_fullrank_matrix_distinct_singular_value as fullrank def run_test(M): @@ -7191,15 +7201,15 @@ def run_test(M): (3, 2), (5, 3, 2), (7, 5, 3, 2), # fat matrices (2, 3), (5, 2, 3), (7, 5, 2, 3), # thin matrices (0, 0), (0, 2), (2, 0), (3, 0, 0), (0, 3, 0), (0, 0, 3)]: # zero numel matrices - M = torch.randn(*sizes, device=device) + M = torch.randn(*sizes, dtype=dtype, device=device) run_test(M) # Test inverse and pseudo-inverse for invertible matrix for sizes in [(5, 5), (3, 5, 5), (3, 7, 5, 5)]: matsize = sizes[-1] batchdims = sizes[:-2] - M = fullrank(matsize, *batchdims).to(device=device) - self.assertEqual(torch.eye(matsize, device=device).expand(sizes), M.pinverse().matmul(M), + M = fullrank(matsize, *batchdims, dtype=dtype, device=device) + self.assertEqual(torch.eye(matsize, dtype=dtype, device=device).expand(sizes), M.pinverse().matmul(M), 1e-7, 'pseudo-inverse for invertible matrix') @skipCUDAIfNoMagma @@ -7237,7 +7247,8 @@ def test_matrix_rank(self, device): @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_matrix_power(self, device): + @dtypes(torch.double) + def test_matrix_power(self, device, dtype): def run_test(M, sign=1): if sign == -1: M = M.inverse() @@ -7257,47 +7268,49 @@ def run_test(M, sign=1): self.assertEqual(MP0, torch.eye(M.size(-2)).expand_as(M)) # Single matrix - M = torch.randn(5, 5, device=device) + M = torch.randn(5, 5, dtype=dtype, device=device) run_test(M) # Batch matrices - M = torch.randn(3, 3, 3, device=device) + M = torch.randn(3, 3, 3, dtype=dtype, device=device) run_test(M) # Many batch matrices - M = torch.randn(2, 3, 3, 3, device=device) + M = torch.randn(2, 3, 3, 3, dtype=dtype, device=device) run_test(M) # This is for negative powers from common_utils import random_fullrank_matrix_distinct_singular_value - M = random_fullrank_matrix_distinct_singular_value(5).to(device) + M = random_fullrank_matrix_distinct_singular_value(5, dtype=dtype, device=device) run_test(M, sign=-1) - M = random_fullrank_matrix_distinct_singular_value(3, 3).to(device) + M = random_fullrank_matrix_distinct_singular_value(3, 3, dtype=dtype, device=device) run_test(M, sign=-1) - M = random_fullrank_matrix_distinct_singular_value(3, 2, 3).to(device) + M = random_fullrank_matrix_distinct_singular_value(3, 2, 3, dtype=dtype, device=device) run_test(M, sign=-1) - def test_chain_matmul(self, device): + @dtypes(torch.double) + def test_chain_matmul(self, device, dtype): def product(matrices): for mat in matrices[1:]: matrices[0] = matrices[0].mm(mat) return matrices[0] - def run_test(p, device): + def run_test(p): matrices = [] for (pi, pi_1) in zip(p[:-1], p[1:]): - matrices.append(torch.randn(pi, pi_1, device=device)) + matrices.append(torch.randn(pi, pi_1, dtype=dtype, device=device)) self.assertEqual(torch.chain_matmul(*matrices), 
product(matrices)) - run_test([10, 20, 30, 5], device) - run_test([15, 5, 10, 20, 25], device) + run_test([10, 20, 30, 5]) + run_test([15, 5, 10, 20, 25]) @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_det_logdet_slogdet(self, device): + @dtypes(torch.double) + def test_det_logdet_slogdet(self, device, dtype): def reference_slogdet(M): if TEST_NUMPY: sdet, logabsdet = np.linalg.slogdet(M.detach().cpu().numpy()) @@ -7349,8 +7362,8 @@ def test_single_det(M, target, desc): else: self.assertEqual(logdet.exp(), target_logabsdet.exp(), 1e-7, '{} (logdet non-negative case)'.format(desc)) - eye = torch.eye(5, device=device) - test_single_det(eye, (torch.ones((), device=device), torch.zeros((), device=device)), 'identity') + eye = torch.eye(5, dtype=dtype, device=device) + test_single_det(eye, (torch.ones((), dtype=dtype, device=device), torch.zeros((), dtype=dtype, device=device)), 'identity') def test(M): assert M.size(0) >= 5, 'this helper fn assumes M to be at least 5x5' @@ -7442,23 +7455,23 @@ def get_random_mat_scale(n): for n in [5, 10, 25]: scale = get_random_mat_scale(n) - test(torch.randn(n, n, device=device) * scale) - r = torch.randn(n, n, device=device) * scale + test(torch.randn(n, n, dtype=dtype, device=device) * scale) + r = torch.randn(n, n, dtype=dtype, device=device) * scale # symmetric psd test(r.mm(r.t())) # symmetric pd - r = torch.randn(n, n, device=device) * scale - test(r.mm(r.t()) + torch.eye(n, device=device) * 1e-6) + r = torch.randn(n, n, dtype=dtype, device=device) * scale + test(r.mm(r.t()) + torch.eye(n, dtype=dtype, device=device) * 1e-6) # symmetric - r = torch.randn(n, n, device=device) * scale + r = torch.randn(n, n, dtype=dtype, device=device) * scale for i in range(n): for j in range(i): r[i, j] = r[j, i] test(r) # non-contiguous - test((torch.randn(n, n, n + 1, device=device) * scale)[:, 2, 1:]) + test((torch.randn(n, n, n + 1, dtype=dtype, device=device) * scale)[:, 2, 1:]) # det = 0 - r = torch.randn(n, n, device=device) * scale + r = torch.randn(n, n, dtype=dtype, device=device) * scale u, s, v = r.svd() if reference_slogdet(u)[0] < 0: u = -u @@ -7470,14 +7483,15 @@ def get_random_mat_scale(n): # Small values to test numerical stability. Note that we don't scale # this matrix. - r = torch.randn(512, 512, device=device) + r = torch.randn(512, 512, dtype=dtype, device=device) u, s, v = r.svd() s.fill_(1. 
/ (100 * s.numel())) test(u.mm(s.diag()).mm(v)) @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_det_logdet_slogdet_batched(self, device): + @dtypes(torch.double) + def test_det_logdet_slogdet_batched(self, device, dtype): from common_utils import (random_symmetric_matrix, random_symmetric_psd_matrix, random_symmetric_pd_matrix, random_square_matrix_of_rank) @@ -7490,15 +7504,15 @@ def run_test(matsize, batchdims, mat_chars): for idx in range(num_matrices): mat_type = idx % len(mat_chars) if mat_chars[mat_type] == 'sym': - list_of_matrices.append(random_symmetric_matrix(matsize).to(device=device)) + list_of_matrices.append(random_symmetric_matrix(matsize, dtype=dtype, device=device)) elif mat_chars[mat_type] == 'sym_psd': - list_of_matrices.append(random_symmetric_psd_matrix(matsize).to(device=device)) + list_of_matrices.append(random_symmetric_psd_matrix(matsize, dtype=dtype, device=device)) elif mat_chars[mat_type] == 'sym_pd': - list_of_matrices.append(random_symmetric_pd_matrix(matsize).to(device=device)) + list_of_matrices.append(random_symmetric_pd_matrix(matsize, dtype=dtype, device=device)) elif mat_chars[mat_type] == 'sing': - list_of_matrices.append(torch.ones(matsize, matsize, device=device)) + list_of_matrices.append(torch.ones(matsize, matsize, dtype=dtype, device=device)) elif mat_chars[mat_type] == 'non_sing': - list_of_matrices.append(random_square_matrix_of_rank(matsize, matsize).to(device=device)) + list_of_matrices.append(random_square_matrix_of_rank(matsize, matsize, dtype=dtype, device=device)) full_tensor = torch.stack(list_of_matrices, dim=0).reshape(batchdims + (matsize, matsize)) # Scaling adapted from `get_random_mat_scale` in _test_det_logdet_slogdet full_tensor *= (math.factorial(matsize - 1) ** (-1.0 / (2 * matsize))) @@ -7525,22 +7539,28 @@ def run_test(matsize, batchdims, mat_chars): run_test(matsize, batchdims, mat_chars=['sym', 'sym_pd', 'sym_psd']) run_test(matsize, batchdims, mat_chars=['sing', 'non_sing']) + def solve_test_helper(self, A_dims, b_dims, device, dtype): + from common_utils import random_fullrank_matrix_distinct_singular_value + + b = torch.randn(*b_dims, dtype=dtype, device=device) + A = random_fullrank_matrix_distinct_singular_value(*A_dims, dtype=dtype, device=device) + return b, A + @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_solve(self, device): - from common_utils import solve_test_helper + @dtypes(torch.double) + def test_solve(self, device, dtype): for (k, n) in zip([2, 3, 5], [3, 5, 7]): - b, A = solve_test_helper((n,), (n, k), lambda t: t.to(device)) + b, A = self.solve_test_helper((n,), (n, k), device, dtype) x = torch.solve(b, A)[0] self.assertLessEqual(b.dist(A.mm(x)), 1e-12) @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_solve_batched(self, device): - from common_utils import solve_test_helper - - def solve_batch_helper(A_dims, b_dims, device): - b, A = solve_test_helper(A_dims, b_dims, lambda t: t.to(device)) + @dtypes(torch.double) + def test_solve_batched(self, device, dtype): + def solve_batch_helper(A_dims, b_dims): + b, A = self.solve_test_helper(A_dims, b_dims, device, dtype) x_exp_list = [] for i in range(b_dims[0]): x_exp_list.append(torch.solve(b[i], A[i])[0]) @@ -7550,75 +7570,80 @@ def solve_batch_helper(A_dims, b_dims, device): self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 1e-12) # Correctness check for batchsize in [1, 3, 4]: - solve_batch_helper((5, batchsize), (batchsize, 5, 10), device) + solve_batch_helper((5, batchsize), (batchsize, 5, 10)) @skipCUDAIfNoMagma @skipCPUIfNoLapack 
@unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_solve_batched_non_contiguous(self, device): + @dtypes(torch.double) + def test_solve_batched_non_contiguous(self, device, dtype): from numpy.linalg import solve from common_utils import random_fullrank_matrix_distinct_singular_value - A = random_fullrank_matrix_distinct_singular_value(2, 2).to(device).permute(1, 0, 2) - b = torch.randn(2, 2, 2, device=device).permute(2, 1, 0) + A = random_fullrank_matrix_distinct_singular_value(2, 2, dtype=dtype, + device=device).permute(1, 0, 2) + b = torch.randn(2, 2, 2, dtype=dtype, device=device).permute(2, 1, 0) x, _ = torch.solve(b, A) - x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())).to(device) + x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())).to(dtype=dtype, device=device) self.assertEqual(x.data, x_exp) @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_solve_batched_many_batches(self, device): - from common_utils import solve_test_helper - - b, A = solve_test_helper((5, 256, 256), (5, 1), lambda t: t.to(device)) + @dtypes(torch.double) + def test_solve_batched_many_batches(self, device, dtype): + b, A = self.solve_test_helper((5, 256, 256), (5, 1), device, dtype) x, _ = torch.solve(b, A) self.assertEqual(torch.matmul(A, x), b.expand(A.shape[:-2] + (5, 1))) - b, A = solve_test_helper((3,), (512, 512, 3, 1), lambda t: t.to(device)) + b, A = self.solve_test_helper((3,), (512, 512, 3, 1), device, dtype) x, _ = torch.solve(b, A) self.assertEqual(torch.matmul(A, x), b) @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_solve_batched_broadcasting(self, device): + @dtypes(torch.double) + def test_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve - from common_utils import solve_test_helper - def cast(t): - return t.to(device) - - def run_test(A_dims, b_dims, cast): + def run_test(A_dims, b_dims): A_matrix_size = A_dims[-1] A_batch_dims = A_dims[:-2] - b, A = solve_test_helper((A_matrix_size,) + A_batch_dims, b_dims, cast) + b, A = self.solve_test_helper((A_matrix_size,) + A_batch_dims, b_dims, device, dtype) x, _ = torch.solve(b, A) - x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())) - self.assertEqual(x, cast(x_exp)) + x_exp = torch.Tensor(solve(A.cpu().numpy(), b.cpu().numpy())).to(dtype=dtype, device=device) + self.assertEqual(x, x_exp) # test against numpy.linalg.solve for upper in [True, False]: - run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6), cast) # no broadcasting - run_test((2, 1, 3, 4, 4), (4, 6), cast) # broadcasting b - run_test((4, 4), (2, 1, 3, 4, 2), cast) # broadcasting A - run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5), cast) # broadcasting A & b + run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6)) # no broadcasting + run_test((2, 1, 3, 4, 4), (4, 6)) # broadcasting b + run_test((4, 4), (2, 1, 3, 4, 2)) # broadcasting A + run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5)) # broadcasting A & b + + def cholesky_solve_test_helper(self, A_dims, b_dims, upper, device, dtype): + from common_utils import random_symmetric_pd_matrix + + b = torch.randn(*b_dims, dtype=dtype, device=device) + A = random_symmetric_pd_matrix(*A_dims, dtype=dtype, device=device) + L = torch.cholesky(A, upper=upper) + return b, A, L @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky_solve(self, device): - from common_utils import cholesky_solve_test_helper + @dtypes(torch.double) + def test_cholesky_solve(self, device, dtype): for (k, n), upper in product(zip([2, 3, 5], [3, 5, 7]), [True, False]): - 
b, A, L = cholesky_solve_test_helper((n,), (n, k), lambda t: t.to(device), upper) + b, A, L = self.cholesky_solve_test_helper((n,), (n, k), upper, device, dtype) x = torch.cholesky_solve(b, L, upper=upper) self.assertLessEqual(b.dist(A.mm(x)), 1e-12) @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky_solve_batched(self, device): - from common_utils import cholesky_solve_test_helper - - def cholesky_solve_batch_helper(A_dims, b_dims, cast, upper): - b, A, L = cholesky_solve_test_helper(A_dims, b_dims, cast, upper) + @dtypes(torch.double) + def test_cholesky_solve_batched(self, device, dtype): + def cholesky_solve_batch_helper(A_dims, b_dims, upper): + b, A, L = self.cholesky_solve_test_helper(A_dims, b_dims, upper, device, dtype) x_exp_list = [] for i in range(b_dims[0]): x_exp_list.append(torch.cholesky_solve(b[i], L[i], upper=upper)) @@ -7628,19 +7653,20 @@ def cholesky_solve_batch_helper(A_dims, b_dims, cast, upper): self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 2e-12) # Correctness check for upper, batchsize in product([True, False], [1, 3, 4]): - cholesky_solve_batch_helper((5, batchsize), (batchsize, 5, 10), lambda t: t.to(device), upper) + cholesky_solve_batch_helper((5, batchsize), (batchsize, 5, 10), upper) @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_cholesky_solve_batched_non_contiguous(self, device): + @dtypes(torch.double) + def test_cholesky_solve_batched_non_contiguous(self, device, dtype): from numpy.linalg import solve from common_utils import random_symmetric_pd_matrix for upper in [True, False]: - A = random_symmetric_pd_matrix(2, 2) - b = torch.randn(2, 2, 2) - x_exp = torch.Tensor(solve(A.permute(0, 2, 1).numpy(), b.permute(2, 1, 0).numpy())).to(device) + A = random_symmetric_pd_matrix(2, 2, dtype=dtype, device='cpu') + b = torch.randn(2, 2, 2, dtype=dtype, device='cpu') + x_exp = torch.Tensor(solve(A.permute(0, 2, 1).numpy(), b.permute(2, 1, 0).numpy())).to(dtype=dtype, device=device) A = A.to(device).permute(0, 2, 1) b = b.to(device).permute(2, 1, 0) assert not A.is_contiguous() and not b.is_contiguous(), "contiguous inputs" @@ -7651,51 +7677,50 @@ def test_cholesky_solve_batched_non_contiguous(self, device): @slowTest @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky_solve_batched_many_batches(self, device): - from common_utils import cholesky_solve_test_helper - + @dtypes(torch.double) + def test_cholesky_solve_batched_many_batches(self, device, dtype): for upper in [True, False]: - b, A, L = cholesky_solve_test_helper((5, 256, 256), (5, 10), lambda t: t.to(device), upper) + b, A, L = self.cholesky_solve_test_helper((5, 256, 256), (5, 10), upper, device, dtype) x = torch.cholesky_solve(b, L, upper) self.assertEqual(torch.matmul(A, x), b.expand(A.shape[:-2] + (5, 10))) - b, A, L = cholesky_solve_test_helper((5,), (512, 512, 5, 10), lambda t: t.to(device), upper) + b, A, L = self.cholesky_solve_test_helper((5,), (512, 512, 5, 10), upper, device, dtype) x = torch.cholesky_solve(b, L, upper) self.assertEqual(torch.matmul(A, x), b) @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_cholesky_solve_batched_broadcasting(self, device): + @dtypes(torch.double) + def test_cholesky_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve from common_utils import random_symmetric_pd_matrix - def cast(t): - return t.to(device) - - def run_test(A_dims, b_dims, cast, upper): + def run_test(A_dims, b_dims, upper): A_matrix_size = 
A_dims[-1] A_batch_dims = A_dims[:-2] - A = random_symmetric_pd_matrix(A_matrix_size, *A_batch_dims) - b = torch.randn(*b_dims) - x_exp = torch.Tensor(solve(A.numpy(), b.numpy())) - A, b = cast(A), cast(b) + A = random_symmetric_pd_matrix(A_matrix_size, *A_batch_dims, + dtype=dtype, device='cpu') + b = torch.randn(*b_dims, dtype=dtype, device='cpu') + x_exp = torch.tensor(solve(A.numpy(), b.numpy()), dtype=dtype, device=device) + A, b = A.to(dtype=dtype, device=device), b.to(dtype=dtype, device=device) L = torch.cholesky(A, upper) x = torch.cholesky_solve(b, L, upper=upper) - self.assertEqual(x, cast(x_exp)) + self.assertEqual(x, x_exp) # test against numpy.linalg.solve for upper in [True, False]: - run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6), cast, upper) # no broadcasting - run_test((2, 1, 3, 4, 4), (4, 6), cast, upper) # broadcasting b - run_test((4, 4), (2, 1, 3, 4, 2), cast, upper) # broadcasting A - run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5), cast, upper) # broadcasting A & b + run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6), upper) # no broadcasting + run_test((2, 1, 3, 4, 4), (4, 6), upper) # broadcasting b + run_test((4, 4), (2, 1, 3, 4, 2), upper) # broadcasting A + run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5), upper) # broadcasting A & b @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky_inverse(self, device): + @dtypes(torch.double) + def test_cholesky_inverse(self, device, dtype): from common_utils import random_symmetric_pd_matrix - a = random_symmetric_pd_matrix(5).to(device) + a = random_symmetric_pd_matrix(5, dtype=dtype, device=device) # compute inverse directly inv0 = torch.inverse(a) @@ -7719,11 +7744,12 @@ def test_cholesky_inverse(self, device): @skipCUDAIf(True, "See issue #26789.") @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky_batched_many_batches(self, device): + @dtypes(torch.double) + def test_cholesky_batched_many_batches(self, device, dtype): from common_utils import random_symmetric_pd_matrix def cholesky_test_helper(n, batchsize, device, upper): - A = random_symmetric_pd_matrix(n, batchsize).to(device) + A = random_symmetric_pd_matrix(n, batchsize, dtype=dtype, device=device) chol_fact = torch.cholesky(A, upper=upper) if upper: # Correctness check @@ -7741,22 +7767,24 @@ def cholesky_test_helper(n, batchsize, device, upper): @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky_batched(self, device): + @dtypes(torch.double) + def test_cholesky_batched(self, device, dtype): from common_utils import random_symmetric_pd_matrix - def cholesky_test_helper(n, batch_dims, device, upper): - A = random_symmetric_pd_matrix(n, *batch_dims).to(device) + def cholesky_test_helper(n, batch_dims, upper): + A = random_symmetric_pd_matrix(n, *batch_dims, dtype=dtype, device=device) cholesky_exp = torch.stack([m.cholesky(upper=upper) for m in A.reshape(-1, n, n)]) cholesky_exp = cholesky_exp.reshape_as(A) self.assertEqual(cholesky_exp, torch.cholesky(A, upper=upper)) for upper, batchsize in product([True, False], [(3,), (3, 4), (2, 3, 4)]): - cholesky_test_helper(3, batchsize, device, upper) + cholesky_test_helper(3, batchsize, upper) @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_cholesky(self, device): - x = torch.rand(10, 10, device=device) + 1e-1 + @dtypes(torch.double) + def test_cholesky(self, device, dtype): + x = torch.rand(10, 10, dtype=dtype, device=device) + 1e-1 A = torch.mm(x, x.t()) # default Case @@ -8326,13 +8354,15 @@ def delitem(): self.assertRaises(TypeError, delitem) @skipCUDANonDefaultStreamIf(True) - def test_advancedindex(self, device): + 
@dtypes(torch.double) + @default_floating_dtype(torch.double) + def test_advancedindex(self, device, dtype): # Tests for Integer Array Indexing, Part I - Purely integer array # indexing def consec(size, start=1): numel = reduce(lambda x, y: x * y, size, 1) - sequence = torch.ones(numel).cumsum(0) + sequence = torch.ones(numel, dtype=dtype).cumsum(0) sequence.add_(start - 1) return sequence.view(*size) @@ -8382,7 +8412,7 @@ def validate_setting(x): # strided is [1, 3, 5, 7] reference = consec((10,)).to(device) - strided = torch.Tensor().to(device) + strided = torch.tensor((), dtype=dtype, device=device) strided.set_(reference.storage(), storage_offset=0, size=torch.Size([4]), stride=[2]) @@ -8395,7 +8425,7 @@ def validate_setting(x): torch.Tensor([[5, 3], [1, 7]])) # stride is [4, 8] - strided = torch.Tensor().to(device) + strided = torch.tensor((), dtype=dtype, device=device) strided.set_(reference.storage(), storage_offset=4, size=torch.Size([2]), stride=[4]) self.assertEqual(strided[[0]], torch.Tensor([5])) @@ -8442,8 +8472,8 @@ def validate_setting(x): reference[ri([0]), ri([1])] = -1 self.assertEqual(reference[ri([0]), ri([1])], torch.Tensor([-1])) reference[ri([0, 1, 2]), ri([0])] = torch.Tensor([-1, 2, -4]).to(device) - self.assertEqual(reference[ri([0, 1, 2]), ri([0])], torch.Tensor([-1, - 2, -4])) + self.assertEqual(reference[ri([0, 1, 2]), ri([0])], + torch.Tensor([-1, 2, -4])) reference[rows, columns] = torch.Tensor([[4, 6], [2, 3]]).to(device) self.assertEqual(reference[rows, columns], torch.Tensor([[4, 6], [2, 3]])) @@ -8452,7 +8482,7 @@ def validate_setting(x): reference = torch.Tensor([[0, 1, 2, 3], [4, 5, 6, 7], - [8, 9, 10, 11]]).to(device).t_() + [8, 9, 10, 11]]).to(dtype=dtype, device=device).t_() # Transposed: [[0, 4, 8], # [1, 5, 9], @@ -8504,8 +8534,8 @@ def validate_setting(x): # strided is [[1 3 5 7], # [9 11 13 15]] - reference = torch.arange(0., 24).view(3, 8).to(device) - strided = torch.Tensor().to(device) + reference = torch.arange(0., 24, dtype=dtype, device=device).view(3, 8) + strided = torch.tensor((), dtype=dtype, device=device) strided.set_(reference.storage(), 1, size=torch.Size([2, 4]), stride=[8, 2]) @@ -8542,26 +8572,26 @@ def validate_setting(x): # strided is [[10, 11], # [17, 18]] - reference = torch.arange(0., 24).view(3, 8).to(device) - strided = torch.Tensor().to(device) + reference = torch.arange(0., 24, dtype=dtype, device=device).view(3, 8) + strided = torch.tensor((), dtype=dtype, device=device) strided.set_(reference.storage(), 10, size=torch.Size([2, 2]), stride=[7, 1]) self.assertEqual(strided[ri([0]), ri([1])], torch.Tensor([11])) strided[ri([0]), ri([1])] = -1 self.assertEqual(strided[ri([0]), ri([1])], torch.Tensor([-1])) - reference = torch.arange(0., 24).view(3, 8).to(device) - strided = torch.Tensor().to(device) + reference = torch.arange(0., 24, dtype=dtype, device=device).view(3, 8) + strided = torch.tensor((), dtype=dtype, device=device) strided.set_(reference.storage(), 10, size=torch.Size([2, 2]), stride=[7, 1]) self.assertEqual(strided[ri([0, 1]), ri([1, 0])], torch.Tensor([11, 17])) - strided[ri([0, 1]), ri([1, 0])] = torch.Tensor([-1, 2]).to(device) + strided[ri([0, 1]), ri([1, 0])] = torch.Tensor([-1, 2]).to(dtype=dtype, device=device) self.assertEqual(strided[ri([0, 1]), ri([1, 0])], torch.Tensor([-1, 2])) - reference = torch.arange(0., 24).view(3, 8).to(device) - strided = torch.Tensor().to(device) + reference = torch.arange(0., 24, dtype=dtype, device=device).view(3, 8) + strided = torch.tensor((), dtype=dtype, 
device=device) strided.set_(reference.storage(), 10, size=torch.Size([2, 2]), stride=[7, 1]) @@ -8571,7 +8601,7 @@ def validate_setting(x): [0, 1]]) self.assertEqual(strided[rows, columns], torch.Tensor([[10, 11], [17, 18]])) - strided[rows, columns] = torch.Tensor([[4, 6], [2, 3]]).to(device) + strided[rows, columns] = torch.Tensor([[4, 6], [2, 3]]).to(dtype=dtype, device=device) self.assertEqual(strided[rows, columns], torch.Tensor([[4, 6], [2, 3]])) @@ -8590,7 +8620,7 @@ def validate_setting(x): reference[ri([1]), ri([0, 2]), ri([3])] # test invalid index fails - reference = torch.empty(10, device=device) + reference = torch.empty(10, dtype=dtype, device=device) # can't test cuda because it is a device assert if not reference.is_cuda: for err_idx in (10, -11): @@ -8608,8 +8638,7 @@ def validate_setting(x): def tensor_indices_to_np(tensor, indices): # convert the Torch Tensor to a numpy array - if (tensor.is_cuda): - tensor = tensor.cpu() + tensor = tensor.to(device='cpu') npt = tensor.numpy() # convert indices @@ -8636,13 +8665,13 @@ def set_numpy(tensor, indices, value): def assert_get_eq(tensor, indexer): self.assertEqual(tensor[indexer], - get_numpy(tensor, indexer).to(device)) + get_numpy(tensor, indexer).to(dtype=dtype, device=device)) def assert_set_eq(tensor, indexer, val): pyt = tensor.clone() numt = tensor.clone() pyt[indexer] = val - numt = torch.Tensor(set_numpy(numt, indexer, val)).to(device) + numt = torch.Tensor(set_numpy(numt, indexer, val)).to(dtype=dtype, device=device) self.assertEqual(pyt, numt) def assert_backward_eq(tensor, indexer): @@ -8665,7 +8694,7 @@ def get_set_tensor(indexed, indexer): # 5 6 7 8 9 # 10 11 12 13 14 # 15 16 17 18 19 - reference = torch.arange(0., 20).view(4, 5).to(device) + reference = torch.arange(0., 20, dtype=dtype, device=device).view(4, 5) indices_to_test = [ # grab the second, fourth columns @@ -8697,7 +8726,7 @@ def get_set_tensor(indexed, indexer): indexer, get_set_tensor(reference, indexer)) - reference = torch.arange(0., 160).view(4, 8, 5).to(device) + reference = torch.arange(0., 160, dtype=dtype, device=device).view(4, 8, 5) indices_to_test = [ [slice(None), slice(None), [0, 3, 4]], @@ -8752,7 +8781,7 @@ def get_set_tensor(indexed, indexer): if torch.cuda.is_available(): assert_backward_eq(reference, indexer) - reference = torch.arange(0., 1296).view(3, 9, 8, 6).to(device) + reference = torch.arange(0., 1296, dtype=dtype, device=device).view(3, 9, 8, 6) indices_to_test = [ [slice(None), slice(None), slice(None), [0, 3, 4]], @@ -8834,14 +8863,15 @@ def get_set_tensor(indexed, indexer): assert_backward_eq(reference, indexer) def test_advancedindex_big(self, device): - reference = torch.arange(0, 123344).int().to(device) + reference = torch.arange(0, 123344, dtype=torch.int, device=device) self.assertEqual(reference[[0, 123, 44488, 68807, 123343], ], torch.LongTensor([0, 123, 44488, 68807, 123343])) - def test_kthvalue(self, device): + @dtypes(torch.double) + def test_kthvalue(self, device, dtype): SIZE = 50 - x = torch.rand(SIZE, SIZE, SIZE, device=device) + x = torch.rand(SIZE, SIZE, SIZE, dtype=dtype, device=device) x0 = x.clone() k = random.randint(1, SIZE) @@ -8852,7 +8882,7 @@ def test_kthvalue(self, device): self.assertEqual(res1ind[:, :], res2ind[:, :, k - 1], 0) # test use of result tensors k = random.randint(1, SIZE) - res1val = torch.tensor([], device=device) + res1val = torch.tensor([], dtype=dtype, device=device) res1ind = torch.tensor([], dtype=torch.long, device=device) torch.kthvalue(x, k, keepdim=False, out=(res1val, 
res1ind)) res2val, res2ind = torch.sort(x) @@ -8879,13 +8909,13 @@ def test_kthvalue(self, device): self.assertEqual(x, x0, 0) # simple test case (with repetitions) - y = torch.tensor((3., 5, 4, 1, 1, 5), device=device) + y = torch.tensor((3., 5, 4, 1, 1, 5), dtype=dtype, device=device) self.assertEqual(torch.kthvalue(y, 3)[0], 3, 0) self.assertEqual(torch.kthvalue(y, 2)[0], 1, 0) # simple test case (with NaN) SIZE = 50 - x = torch.rand(SIZE, SIZE, SIZE, device=device) + x = torch.rand(SIZE, SIZE, SIZE, dtype=dtype, device=device) x[torch.arange(SIZE), :, torch.randint(50, (50,))] = nan ks = [random.randint(1, SIZE), 1, SIZE, SIZE - 1] res2val, res2ind = torch.sort(x) @@ -8897,12 +8927,13 @@ def test_kthvalue(self, device): @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_lu_solve_batched_non_contiguous(self, device): + @dtypes(torch.double) + def test_lu_solve_batched_non_contiguous(self, device, dtype): from numpy.linalg import solve from common_utils import random_fullrank_matrix_distinct_singular_value - A = random_fullrank_matrix_distinct_singular_value(2, 2) - b = torch.randn(2, 2, 2) + A = random_fullrank_matrix_distinct_singular_value(2, 2, dtype=dtype, device='cpu') + b = torch.randn(2, 2, 2, dtype=dtype, device='cpu') x_exp = torch.as_tensor(solve(A.permute(0, 2, 1).numpy(), b.permute(2, 1, 0).numpy())).to(device) A = A.to(device).permute(0, 2, 1) b = b.to(device).permute(2, 1, 0) @@ -8911,47 +8942,95 @@ def test_lu_solve_batched_non_contiguous(self, device): x = torch.lu_solve(b, LU_data, LU_pivots) self.assertEqual(x, x_exp) - @slowTest + def lu_solve_test_helper(self, A_dims, b_dims, pivot, device, dtype): + from common_utils import random_fullrank_matrix_distinct_singular_value + + b = torch.randn(*b_dims, dtype=dtype, device=device) + A = random_fullrank_matrix_distinct_singular_value(*A_dims, dtype=dtype, device=device) + LU_data, LU_pivots, info = torch.lu(A, get_infos=True, pivot=pivot) + self.assertEqual(info, torch.zeros_like(info)) + return b, A, LU_data, LU_pivots + + @skipCPUIfNoLapack + @skipCUDAIfNoMagma + @dtypes(torch.double) + def test_lu_solve(self, device, dtype): + def sub_test(pivot): + for k, n in zip([2, 3, 5], [3, 5, 7]): + b, A, LU_data, LU_pivots = self.lu_solve_test_helper((n,), (n, k), pivot, device, dtype) + x = torch.lu_solve(b, LU_data, LU_pivots) + self.assertLessEqual(b.dist(A.mm(x)), 1e-12) + + sub_test(True) + if self.device_type == 'cuda': + sub_test(False) + @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_lu_solve_batched_many_batches(self, device): - from common_utils import lu_solve_test_helper + @dtypes(torch.double) + def test_lu_solve_batched(self, device, dtype): + def sub_test(pivot): + def lu_solve_batch_test_helper(A_dims, b_dims, pivot): + b, A, LU_data, LU_pivots = self.lu_solve_test_helper(A_dims, b_dims, pivot, device, dtype) + x_exp_list = [] + for i in range(b_dims[0]): + x_exp_list.append(torch.lu_solve(b[i], LU_data[i], LU_pivots[i])) + x_exp = torch.stack(x_exp_list) # Stacked output + x_act = torch.lu_solve(b, LU_data, LU_pivots) # Actual output + self.assertEqual(x_exp, x_act) # Equality check + self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 1e-12) # Correctness check + + for batchsize in [1, 3, 4]: + lu_solve_batch_test_helper((5, batchsize), (batchsize, 5, 10), pivot) - def cast(t): - return t.to(device) + # Tests tensors with 0 elements + b = torch.randn(3, 0, 3, dtype=dtype, device=device) + A = torch.randn(3, 0, 0, dtype=dtype, device=device) + LU_data, 
LU_pivots = torch.lu(A) + self.assertEqual(torch.empty_like(b), b.lu_solve(LU_data, LU_pivots)) + + sub_test(True) + if self.device_type == 'cuda': + sub_test(False) - def run_test(A_dims, b_dims, cast): - b, A, LU_data, LU_pivots = lu_solve_test_helper(self, A_dims, b_dims, cast, True) + @slowTest + @skipCUDAIfNoMagma + @skipCPUIfNoLapack + @dtypes(torch.double) + def test_lu_solve_batched_many_batches(self, device, dtype): + def run_test(A_dims, b_dims): + b, A, LU_data, LU_pivots = self.lu_solve_test_helper(A_dims, b_dims, True, device, dtype) x = torch.lu_solve(b, LU_data, LU_pivots) b_ = torch.matmul(A, x) self.assertEqual(b_, b.expand_as(b_)) - run_test((5, 65536), (65536, 5, 10), cast) - run_test((5, 262144), (262144, 5, 10), cast) + run_test((5, 65536), (65536, 5, 10)) + run_test((5, 262144), (262144, 5, 10)) @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") - def test_lu_solve_batched_broadcasting(self, device): + @dtypes(torch.double) + def test_lu_solve_batched_broadcasting(self, device, dtype): from numpy.linalg import solve from common_utils import random_fullrank_matrix_distinct_singular_value - def run_test(A_dims, b_dims, device, pivot=True): + def run_test(A_dims, b_dims, pivot=True): A_matrix_size = A_dims[-1] A_batch_dims = A_dims[:-2] - A = random_fullrank_matrix_distinct_singular_value(A_matrix_size, *A_batch_dims) - b = torch.randn(*b_dims) - x_exp = torch.as_tensor(solve(A.numpy(), b.numpy())).to(device) + A = random_fullrank_matrix_distinct_singular_value(A_matrix_size, *A_batch_dims, dtype=dtype) + b = torch.randn(*b_dims, dtype=dtype) + x_exp = torch.as_tensor(solve(A.numpy(), b.numpy())).to(dtype=dtype, device=device) A, b = A.to(device), b.to(device) LU_data, LU_pivots = torch.lu(A, pivot=pivot) x = torch.lu_solve(b, LU_data, LU_pivots) self.assertEqual(x, x_exp) # test against numpy.linalg.solve - run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6), device) # no broadcasting - run_test((2, 1, 3, 4, 4), (4, 6), device) # broadcasting b - run_test((4, 4), (2, 1, 3, 4, 2), device) # broadcasting A - run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5), device) # broadcasting A & b + run_test((2, 1, 3, 4, 4), (2, 1, 3, 4, 6)) # no broadcasting + run_test((2, 1, 3, 4, 4), (4, 6)) # broadcasting b + run_test((4, 4), (2, 1, 3, 4, 2)) # broadcasting A + run_test((1, 3, 1, 4, 4), (2, 1, 3, 4, 5)) # broadcasting A & b def test_dim_reduction(self, device): example = [[-1, 2, 1], [5, 3, 6]] @@ -9100,13 +9179,14 @@ def test_rpow(self, device): @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_symeig(self, device): + @dtypes(torch.double) + def test_symeig(self, device, dtype): from common_utils import random_symmetric_matrix def run_test(dims, eigenvectors, upper): - x = random_symmetric_matrix(*dims).to(device) - oute = torch.empty(dims[1:] + dims[:1], device=device) - outv = torch.empty(dims[1:] + dims[:1] * 2, device=device) + x = random_symmetric_matrix(*dims, dtype=dtype, device=device) + oute = torch.empty(dims[1:] + dims[:1], dtype=dtype, device=device) + outv = torch.empty(dims[1:] + dims[:1] * 2, dtype=dtype, device=device) torch.symeig(x, eigenvectors=eigenvectors, upper=upper, out=(oute, outv)) if eigenvectors: @@ -9122,7 +9202,7 @@ def run_test(dims, eigenvectors, upper): self.assertEqual(resv, outv, "outputs of symeig and symeig with out don't match") # test non-contiguous - x = random_symmetric_matrix(*dims).to(device) + x = random_symmetric_matrix(*dims, dtype=dtype, device=device) n_dim = len(dims) + 1 # Reverse the batch dimensions and the 
matrix dimensions and then concat them x = x.permute(tuple(range(n_dim - 3, -1, -1)) + (n_dim - 1, n_dim - 2)) @@ -9142,10 +9222,13 @@ def run_test(dims, eigenvectors, upper): @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_svd(self, device): + @dtypes(torch.double) + def test_svd(self, device, dtype): def run_test(dims, some, compute_uv): - x = torch.randn(*dims, device=device) - outu, outs, outv = torch.Tensor().to(device), torch.Tensor().to(device), torch.Tensor().to(device) + x = torch.randn(*dims, dtype=dtype, device=device) + outu = torch.tensor((), dtype=dtype, device=device) + outs = torch.tensor((), dtype=dtype, device=device) + outv = torch.tensor((), dtype=dtype, device=device) torch.svd(x, some=some, compute_uv=compute_uv, out=(outu, outs, outv)) if compute_uv: @@ -9169,7 +9252,7 @@ def run_test(dims, some, compute_uv): self.assertEqual(resv, outv, 'outputs of svd and svd with out differ') # test non-contiguous - x = torch.randn(*dims, device=device) + x = torch.randn(*dims, dtype=dtype, device=device) n_dim = len(dims) # Reverse the batch dimensions and the matrix dimensions and then concat them x = x.permute(tuple(range(n_dim - 3, -1, -1)) + (n_dim - 1, n_dim - 2)) @@ -9392,39 +9475,75 @@ def test_geqrf(self, device): self.assertEqual(b, b_placeholder) self.assertEqual(c, c_placeholder) + def triangular_solve_test_helper(self, A_dims, b_dims, upper, unitriangular, + device, dtype): + triangle_function = torch.triu if upper else torch.tril + b = torch.randn(*b_dims, dtype=dtype, device=device) + A = torch.randn(*A_dims, dtype=dtype, device=device) + A_triangular = triangle_function(A) + if unitriangular: + A_triangular.diagonal(dim1=-2, dim2=-1).fill_(1.) + return b, A_triangular + @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_triangular_solve(self, device): - from common_utils import triangular_solve_test_helper + @dtypes(torch.double) + def test_triangular_solve(self, device, dtype): for (k, n), (upper, unitriangular, transpose) in product(zip([2, 3, 5], [3, 5, 7]), product([True, False], repeat=3)): - b, A = triangular_solve_test_helper((n, n), (n, k), lambda t: t.to(device), upper, unitriangular) + b, A = self.triangular_solve_test_helper((n, n), (n, k), upper, + unitriangular, device, dtype) x = torch.triangular_solve(b, A, upper=upper, unitriangular=unitriangular, transpose=transpose)[0] if transpose: self.assertLessEqual(b.dist(A.t().mm(x)), 4e-12) else: self.assertLessEqual(b.dist(A.mm(x)), 4e-12) - @slowTest - @skipCUDAIfNoMagma @skipCPUIfNoLapack - def test_triangular_solve_batched_many_batches(self, device): - from common_utils import triangular_solve_test_helper + @skipCUDAIfNoMagma + @dtypes(torch.double) + def test_triangular_solve_batched(self, device, dtype): + def triangular_solve_batch_helper(A_dims, b_dims, upper, unitriangular, transpose): + b, A = self.triangular_solve_test_helper(A_dims, b_dims, upper, + unitriangular, device, dtype) + x_exp_list = [] + for i in range(b_dims[0]): + x_exp_list.append(torch.triangular_solve(b[i], A[i], upper=upper, + unitriangular=unitriangular, + transpose=transpose)[0]) + x_exp = torch.stack(x_exp_list) # Stacked output + x_act = torch.triangular_solve(b, A, upper=upper, + unitriangular=unitriangular, + transpose=transpose)[0] # Actual output + self.assertEqual(x_act, x_exp) # Equality check + if transpose: + self.assertLessEqual(b.dist(torch.matmul(A.transpose(-2, -1), x_act)), 3e-12) # Correctness check + else: + self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 3e-12) # Correctness check - def cast(t): - return 
@@ -9432,9 +9551,9 @@ def cast(t):
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
     @unittest.skipIf(not TEST_SCIPY, "SciPy not found")
-    def test_triangular_solve_batched_broadcasting(self, device):
+    @dtypes(torch.double)
+    def test_triangular_solve_batched_broadcasting(self, device, dtype):
         from scipy.linalg import solve_triangular as tri_solve
-        from common_utils import triangular_solve_test_helper
 
         def scipy_tri_solve_batched(A, B, upper, trans, diag):
             batch_dims_A, batch_dims_B = A.shape[:-2], B.shape[:-2]
@@ -9450,7 +9569,8 @@ def scipy_tri_solve_batched(A, B, upper, trans, diag):
             return flat_X.reshape(expand_B.shape)
 
         def run_test(A_dims, b_dims, device, upper, transpose, unitriangular):
-            b, A = triangular_solve_test_helper(A_dims, b_dims, lambda t: t.to(device), upper, unitriangular)
+            b, A = self.triangular_solve_test_helper(A_dims, b_dims, upper,
+                                                     unitriangular, device, dtype)
             x_exp = torch.as_tensor(scipy_tri_solve_batched(A.cpu().numpy(), b.cpu().numpy(),
                                                             upper, transpose, unitriangular))
             x = torch.triangular_solve(b, A, upper=upper, transpose=transpose, unitriangular=unitriangular)[0]
@@ -9466,13 +9586,15 @@ def run_test(A_dims, b_dims, device, upper, transpose, unitriangular):
 
     @skipCUDAIfNoMagma
     @skipCPUIfNoLapack
-    def test_lstsq(self, device):
-        def cast_fn(tensor):
-            return tensor.to(device=device)
+    @dtypes(torch.double)
+    @default_floating_dtype(torch.double)
+    def test_lstsq(self, device, dtype):
+        def cast_fn(t):
+            return t.to(dtype=dtype, device=device)
 
         def _test_underdetermined(a, b, expectedNorm):
-            # underdetermined systems are not supported on the GPU
-            if not torch.device(device).type == 'cpu':
+            # underdetermined systems are only supported on CPU
+            if not self.device_type == 'cpu':
                 return
 
             m = a.size()[0]
@@ -9486,8 +9608,8 @@ def _test_underdetermined(a, b, expectedNorm):
             self.assertEqual(b, b_copy, 0)
             self.assertEqual((torch.mm(a, res1) - b).norm(), expectedNorm, 1e-8)
 
-            ta = cast_fn(torch.Tensor())
-            tb = cast_fn(torch.Tensor())
+            ta = torch.tensor((), dtype=dtype, device=device)
+            tb = torch.tensor((), dtype=dtype, device=device)
             res2 = torch.lstsq(b, a, out=(tb, ta))[0]
             self.assertEqual(a, a_copy, 0)
             self.assertEqual(b, b_copy, 0)
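As context for the residual-norm assertions above: torch.lstsq stores the solution of an overdetermined system in the first n rows of its result. A rough sketch with made-up sizes, cross-checked against the normal equations using the torch.solve of this API vintage:

import torch

m, n, k = 5, 3, 2
a = torch.randn(m, n, dtype=torch.double)
b = torch.randn(m, k, dtype=torch.double)

x, _ = torch.lstsq(b, a)            # note the argument order: (B, A)
residual = (a.mm(x[:n]) - b).norm()
print(float(residual))

# The least-squares solution also satisfies the normal equations
# (a^T a) x = a^T b, which gives an independent cross-check.
x_ref, _ = torch.solve(a.t().mm(b), a.t().mm(a))
assert torch.allclose(x[:n], x_ref, atol=1e-8)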
@@ -9524,8 +9646,8 @@ def check_norm(a, b, expected_norm, gels_result):
         self.assertEqual(b, b_copy, 0)
         check_norm(a, b, expectedNorm, res1)
 
-        ta = cast_fn(torch.Tensor())
-        tb = cast_fn(torch.Tensor())
+        ta = torch.tensor((), dtype=dtype, device=device)
+        tb = torch.tensor((), dtype=dtype, device=device)
         res2 = torch.lstsq(b, a, out=(tb, ta))[0]
         self.assertEqual(a, a_copy, 0)
         self.assertEqual(b, b_copy, 0)
@@ -9577,8 +9699,8 @@ def check_norm(a, b, expected_norm, gels_result):
                                 (4.53, 3.83, -6.64, 2.06)))).t()
         b = cast_fn(torch.Tensor(((8.58, 8.26, 8.48, -5.28),
                                   (9.35, -4.43, -0.70, -0.26)))).t()
-        ta = cast_fn(torch.Tensor())
-        tb = cast_fn(torch.Tensor())
+        ta = torch.tensor((), dtype=dtype, device=device)
+        tb = torch.tensor((), dtype=dtype, device=device)
         torch.lstsq(b, a, out=(tb, ta))
         self.assertEqual((torch.mm(a, tb) - b).norm(), expectedNorm, 1e-8)
         torch.lstsq(b, a, out=(tb, ta))
@@ -11563,9 +11685,10 @@ def non_zero_rand(size, dtype, device):
             non_zero_rand((2, 2), dtype=dtype, device=device))
 
     # TODO: run on non-native device types
-    def test_unary_out_op_mem_overlap(self, device):
+    @dtypes(torch.double)
+    def test_unary_out_op_mem_overlap(self, device, dtype):
        sz = 3
-        doubles = torch.randn(2 * sz, device=device)
+        doubles = torch.randn(2 * sz, dtype=dtype, device=device)
         positives = torch.randint(1, 100, (2 * sz,), device=device).double()
         ints = torch.randint(-100, 100, (2 * sz,), device=device)
         unary_mem_overlap_cases = [
@@ -11642,10 +11765,11 @@ def test_unary_out_op_mem_overlap(self, device):
             self.unary_check_input_output_mem_overlap(inputs, sz, out_fn,
                                                       expected_failure=not has_input_output_mem_overlap_check)
 
-            self.check_internal_mem_overlap(in_fn, num_inputs=1, device=dev,
+            self.check_internal_mem_overlap(in_fn, 1, dtype, dev,
                                             expected_failure=not has_internal_mem_overlap_check)
 
-    def test_binary_op_mem_overlap(self, device):
+    @dtypes(torch.double)
+    def test_binary_op_mem_overlap(self, device, dtype):
         ops = [
             ("add", True, True, 'cpu'),
             ("add", True, True, 'cuda'),
@@ -11666,13 +11790,14 @@ def test_binary_op_mem_overlap(self, device):
             out_op = getattr(torch, fn)
             inplace_op = getattr(torch.Tensor, fn + '_')
             self.check_internal_mem_overlap(
-                inplace_op, num_inputs=2, device=device,
+                inplace_op, 2, dtype, device,
                 expected_failure=not has_internal_mem_overlap_check)
 
             self.binary_check_input_output_mem_overlap(out_op, device,
                                                        expected_failure=not has_input_output_mem_overlap_check)
 
-    def test_ternary_op_mem_overlap(self, device):
+    @dtypes(torch.double)
+    def test_ternary_op_mem_overlap(self, device, dtype):
         ops = [
             ("addcmul", True, True, 'cpu'),
             ("addcmul", True, True, 'cuda'),
@@ -11689,24 +11814,26 @@ def test_ternary_op_mem_overlap(self, device):
             out_op = getattr(torch, fn)
             inplace_op = getattr(torch.Tensor, fn + '_')
             self.check_internal_mem_overlap(
-                inplace_op, num_inputs=3, device=device,
+                inplace_op, 3, dtype, device,
                 expected_failure=not has_internal_mem_overlap_check)
 
             self.ternary_check_input_output_mem_overlap(out_op, dev,
                                                         expected_failure=not has_input_output_mem_overlap_check)
 
-    def test_copy_mem_overlap(self, device):
+    @dtypes(torch.double)
+    def test_copy_mem_overlap(self, device, dtype):
         self.check_internal_mem_overlap(
-            torch.Tensor.copy_, num_inputs=2, device=device)
+            torch.Tensor.copy_, num_inputs=2, dtype=dtype, device=device)
         sz = 3
-        doubles = torch.randn(2 * sz, device=device)
+        doubles = torch.randn(2 * sz, dtype=dtype, device=device)
         self.unary_check_input_output_mem_overlap(
             doubles, sz, lambda input, out: out.copy_(input))
 
-    def test_pow_scalar_overloads_mem_overlap(self, device):
+    @dtypes(torch.double)
+    def test_pow_scalar_overloads_mem_overlap(self, device, dtype):
         sz = 3
-        doubles = torch.randn(2 * sz, device=device)
+        doubles = torch.randn(2 * sz, dtype=dtype, device=device)
         self.check_internal_mem_overlap(
-            lambda t: t.pow_(42), num_inputs=1, device=device)
+            lambda t: t.pow_(42), 1, dtype, device)
         self.unary_check_input_output_mem_overlap(
             doubles, sz, lambda input, out: torch.pow(input, 42, out=out))
         self.unary_check_input_output_mem_overlap(
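The invariant all of these mem-overlap tests enforce, reduced to a sketch: an in-place op on a tensor whose elements alias one another (here via expand) should raise rather than silently write garbage. The exact error text and per-op coverage vary by op and device, which is what the expected_failure flags above track:

import torch

t = torch.randn(1, dtype=torch.double).expand(3)  # three views of one element

try:
    t.add_(1)  # in-place write over internally overlapping memory
except RuntimeError as e:
    print('refused as expected:', e)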
@@ -11796,7 +11923,8 @@ def test_var_mean_some_dims(self, device):
 
     @skipCUDAIfRocm
     # stft -> rfft -> _fft -> _fft_with_size -> _fft_mkl
     @unittest.skipIf(not TEST_MKL, "PyTorch is built without MKL support")
-    def test_stft(self, device):
+    @dtypes(torch.double)
+    def test_stft(self, device, dtype):
         if not TEST_LIBROSA:
             raise unittest.SkipTest('librosa not found')
 
@@ -11819,9 +11947,9 @@ def librosa_stft(x, n_fft, hop_length, win_length, window, center):
 
         def _test(sizes, n_fft, hop_length=None, win_length=None, win_sizes=None,
                   center=True, expected_error=None):
-            x = torch.randn(*sizes, device=device)
+            x = torch.randn(*sizes, dtype=dtype, device=device)
             if win_sizes is not None:
-                window = torch.randn(*win_sizes, device=device)
+                window = torch.randn(*win_sizes, dtype=dtype, device=device)
             else:
                 window = None
             if expected_error is None:
@@ -12704,62 +12832,11 @@ def run_subtest(matrix_size, batches, device, pivot):
 
     @skipCPUIfNoLapack
     @skipCUDAIfNoMagma
-    def test_lu_solve(self, device, pivot=True):
-        from common_utils import lu_solve_test_helper
-
-        def cast(t):
-            return t.to(device)
-
-        def sub_test(pivot):
-            for k, n in zip([2, 3, 5], [3, 5, 7]):
-                b, A, LU_data, LU_pivots = lu_solve_test_helper(self, (n,), (n, k), cast, pivot)
-                x = torch.lu_solve(b, LU_data, LU_pivots)
-                self.assertLessEqual(b.dist(A.mm(x)), 1e-12)
-
-        sub_test(True)
-
-        if self.device_type == 'cuda':
-            sub_test(False)
-
-    @skipCUDAIfNoMagma
-    @skipCPUIfNoLapack
-    def test_lu_solve_batched(self, device):
-        from common_utils import lu_solve_test_helper
-
-        def cast(t):
-            return t.to(device)
-
-        def sub_test(pivot):
-            def lu_solve_batch_test_helper(A_dims, b_dims, cast, pivot):
-                b, A, LU_data, LU_pivots = lu_solve_test_helper(self, A_dims, b_dims, cast, pivot)
-                x_exp_list = []
-                for i in range(b_dims[0]):
-                    x_exp_list.append(torch.lu_solve(b[i], LU_data[i], LU_pivots[i]))
-                x_exp = torch.stack(x_exp_list)  # Stacked output
-                x_act = torch.lu_solve(b, LU_data, LU_pivots)  # Actual output
-                self.assertEqual(x_exp, x_act)  # Equality check
-                self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 1e-12)  # Correctness check
-
-            for batchsize in [1, 3, 4]:
-                lu_solve_batch_test_helper((5, batchsize), (batchsize, 5, 10), cast, pivot)
-
-            # Tests tensors with 0 elements
-            b = torch.randn(3, 0, 3, device=device)
-            A = torch.randn(3, 0, 0, device=device)
-            LU_data, LU_pivots = torch.lu(A)
-            self.assertEqual(torch.empty_like(b), b.lu_solve(LU_data, LU_pivots))
-
-        sub_test(True)
-
-        if self.device_type == 'cuda':
-            sub_test(False)
-
-    @skipCPUIfNoLapack
-    @skipCUDAIfNoMagma
-    def test_lu_unpack(self, device, pivot=True):
+    @dtypes(torch.double)
+    def test_lu_unpack(self, device, dtype):
         def run_test(pivot):
             for shape in ((3, 3), (5, 3, 3), (7, 3, 5, 5), (7, 5, 3, 3, 3)):
-                a = torch.randn(*shape, device=device)
+                a = torch.randn(*shape, dtype=dtype, device=device)
                 a_lu, p = torch.lu(a, pivot=pivot)
                 p_ref, l_ref, u_ref = torch.lu_unpack(a_lu, p)
                 self.assertEqual(p_ref.matmul(l_ref.matmul(u_ref)), a)
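The round trip that test_lu_unpack relies on, as a standalone sketch; the batch shape here is an illustrative choice, not one of the shapes from the test:

import torch

a = torch.randn(5, 3, 3, dtype=torch.double)
a_lu, pivots = torch.lu(a)
p, l, u = torch.lu_unpack(a_lu, pivots)

# P @ L @ U must reconstruct the original batch of matrices.
assert torch.allclose(p.matmul(l.matmul(u)), a)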
@@ -12783,33 +12860,6 @@ def test_min_with_inf(self, device, dtype):
             self.assertTrue(torch.all(torch.min(a, dim=1)[0] == (-inf)).item())
             self.assertTrue(torch.min(a).item() == -inf)
 
-    @skipCPUIfNoLapack
-    @skipCUDAIfNoMagma
-    def test_triangular_solve_batched(self, device):
-        from common_utils import triangular_solve_test_helper
-
-        def cast(t):
-            return t.to(device)
-
-        def triangular_solve_batch_helper(A_dims, b_dims, cast, upper, unitriangular, transpose):
-            b, A = triangular_solve_test_helper(A_dims, b_dims, cast, upper, unitriangular)
-            x_exp_list = []
-            for i in range(b_dims[0]):
-                x_exp_list.append(torch.triangular_solve(b[i], A[i], upper=upper,
-                                                         unitriangular=unitriangular, transpose=transpose)[0])
-            x_exp = torch.stack(x_exp_list)  # Stacked output
-            x_act = torch.triangular_solve(b, A, upper=upper,
-                                           unitriangular=unitriangular, transpose=transpose)[0]  # Actual output
-            self.assertEqual(x_act, x_exp)  # Equality check
-            if transpose:
-                self.assertLessEqual(b.dist(torch.matmul(A.transpose(-2, -1), x_act)), 3e-12)  # Correctness check
-            else:
-                self.assertLessEqual(b.dist(torch.matmul(A, x_act)), 3e-12)  # Correctness check
-
-        for (upper, unitriangular, transpose), batchsize in product(product([True, False], repeat=3), [1, 3, 4]):
-            triangular_solve_batch_helper((batchsize, 5, 5), (batchsize, 5, 10), cast,
-                                          upper, unitriangular, transpose)
-
     def test_bincount(self, device):
         # negative input throws
         with self.assertRaisesRegex(RuntimeError, '1-d non-negative integral'):
@@ -13029,21 +13079,23 @@ def test_linspace(self, device):
         b = torch.linspace(0, 10, 10)
         self.assertEqual(a, b.to(device))
 
-    def test_logspace(self, device):
-        a = torch.logspace(1, 10, 10, device=device)
-        b = torch.logspace(1, 10, 10)
+    @dtypes(torch.double)
+    def test_logspace(self, device, dtype):
+        a = torch.logspace(1, 10, 10, dtype=dtype, device=device)
+        b = torch.logspace(1, 10, 10, dtype=dtype, device='cpu')
         self.assertEqual(a, b.to(device))
 
         # Check non-default base=2
-        a = torch.logspace(1, 10, 10, 2, device=device)
-        b = torch.logspace(1, 10, 10, 2)
+        a = torch.logspace(1, 10, 10, 2, dtype=dtype, device=device)
+        b = torch.logspace(1, 10, 10, 2, dtype=dtype, device='cpu')
         self.assertEqual(a, b.to(device))
 
     # Note: ROCm fails when using float tensors
-    def test_polygamma(self, device):
-        cpu_tensor = torch.randn(10, 10, 10)
+    @dtypes(torch.double)
+    def test_polygamma(self, device, dtype):
+        cpu_tensor = torch.randn(10, 10, 10, dtype=dtype)
         device_tensor = cpu_tensor.to(device)
-        zeros = torch.zeros(10, 10, 10)
+        zeros = torch.zeros(10, 10, 10, dtype=dtype)
         for n in [0, 1]:
             cpu_out = cpu_tensor.polygamma(n)
             device_out = device_tensor.polygamma(n)
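The CPU-versus-device comparison pattern these special-function tests use, reduced to a sketch; the CUDA fallback guard and the offset that keeps values away from the poles are assumptions so the snippet runs anywhere:

import torch

cpu = torch.randn(10, dtype=torch.double).abs() + 0.5   # stay clear of the poles
dev = cpu.cuda() if torch.cuda.is_available() else cpu

assert torch.allclose(dev.digamma().cpu(), cpu.digamma())
assert torch.allclose(dev.polygamma(1).cpu(), cpu.polygamma(1))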
@@ -13051,10 +13103,11 @@ def test_polygamma(self, device):
             self.assertEqual(norm_errors, zeros)
 
     # Note: fails when using float tensors
-    def test_digamma(self, device):
-        cpu_tensor = torch.randn(10, 10, 10)
+    @dtypes(torch.double)
+    def test_digamma(self, device, dtype):
+        cpu_tensor = torch.randn(10, 10, 10, dtype=dtype)
         device_tensor = cpu_tensor.to(device)
-        zeros = torch.zeros(10, 10, 10)
+        zeros = torch.zeros(10, 10, 10, dtype=dtype)
         cpu_out = cpu_tensor.digamma()
         device_out = device_tensor.digamma()
         norm_errors = (device_out - cpu_out.to(device)) / device_out
@@ -13063,8 +13116,8 @@ def test_digamma(self, device):
         # Tests pole behavior
         cpu_tensor = torch.tensor([-0.999999994, -1.999999994, -2.0000000111,
                                    -100.99999994, -1931.99999994, 0.000000111,
-                                   -0.000000111, 0, -1, -2, -931])
-        expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan])
+                                   -0.000000111, 0, -1, -2, -931], dtype=dtype)
+        expected_errors = torch.tensor([0, 0, 0, 0, 0, 0, 0, nan, nan, nan, nan], dtype=dtype)
         device_tensor = cpu_tensor.to(device)
         cpu_out = cpu_tensor.digamma()
         device_out = device_tensor.digamma()
@@ -13110,8 +13163,9 @@ def test_arange(self, device, dtype):
         self.assertEqual(cpu_tensor, device_tensor)
 
     @skipCUDAIfRocm
-    def test_sum_noncontig(self, device):
-        x = torch.randn(1, 75, 57, 20, device=device).permute(0, 3, 1, 2)
+    @dtypes(torch.double)
+    def test_sum_noncontig(self, device, dtype):
+        x = torch.randn(1, 75, 57, 20, dtype=dtype, device=device).permute(0, 3, 1, 2)
         y = x.cpu()
         self.assertEqual(x.sum().cpu(), y.sum())
         self.assertEqual(x.sum(dim=(-1, -2)).cpu(), y.sum(dim=(-1, -2)))
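Finally, the property test_sum_noncontig guards, in isolation: reducing a non-contiguous (permuted) tensor must agree with the contiguous result. The shape mirrors the test; comparing against a contiguous copy via allclose is an assumption of this sketch:

import torch

x = torch.randn(1, 75, 57, 20, dtype=torch.double).permute(0, 3, 1, 2)
assert not x.is_contiguous()

# The sum kernel must walk the permuted strides correctly.
assert torch.allclose(x.sum(dim=(-1, -2)), x.contiguous().sum(dim=(-1, -2)))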