From 5756169a54cf2137b3d4440dfaa2778542f75b99 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Tue, 30 Aug 2022 18:13:25 -0700
Subject: [PATCH 01/12] refactor: Refactor testing to use cosine similarity,
 remove redundant models and restructure tests

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 .../lowering/test_module_fallback_passes.cpp  |   2 +-
 tests/core/partitioning/BUILD                 |  16 ---
 .../test_fallback_graph_output.cpp            |  69 ----------
 tests/cpp/BUILD                               |  47 -------
 tests/cpp/test_compiled_modules.cpp           |  65 ---------
 tests/cpp/test_module_fallback.cpp            |  74 -----------
 tests/cpp/test_modules_as_engines.cpp         |  47 +------
 tests/cpp/test_multi_gpu_serde.cpp            |   6 +-
 .../cpp/test_multiple_registered_engines.cpp  |  68 ----------
 tests/py/api/custom_models.py                 |  27 ++++
 tests/py/api/test_e2e_behavior.py             |  97 --------------
 tests/py/api/test_embed_engines.py            |  60 +++++++++
 tests/py/api/test_models.py                   | 124 ++++++++++++++++++
 tests/py/api/test_module_fallback.py          |  53 ++++++++
 .../api/test_multiple_registered_engines.py   |  40 ++++++
 tests/py/api/test_operator_fallback.py        |  52 ++++++++
 tests/py/api/test_ts_backend.py               |  83 ++----------
 tests/py/api/utils.py                         |   9 ++
 tests/util/util.cpp                           |  13 ++
 tests/util/util.h                             |   2 +
 20 files changed, 402 insertions(+), 552 deletions(-)
 delete mode 100644 tests/core/partitioning/test_fallback_graph_output.cpp
 delete mode 100644 tests/cpp/test_compiled_modules.cpp
 delete mode 100644 tests/cpp/test_module_fallback.cpp
 delete mode 100644 tests/cpp/test_multiple_registered_engines.cpp
 create mode 100644 tests/py/api/custom_models.py
 create mode 100644 tests/py/api/test_embed_engines.py
 create mode 100644 tests/py/api/test_models.py
 create mode 100644 tests/py/api/test_module_fallback.py
 create mode 100644 tests/py/api/test_multiple_registered_engines.py
 create mode 100644 tests/py/api/test_operator_fallback.py
 create mode 100644 tests/py/api/utils.py

diff --git a/tests/core/lowering/test_module_fallback_passes.cpp b/tests/core/lowering/test_module_fallback_passes.cpp
index f11882df8b..e6eb098079 100644
--- a/tests/core/lowering/test_module_fallback_passes.cpp
+++ b/tests/core/lowering/test_module_fallback_passes.cpp
@@ -124,5 +124,5 @@ TEST(Lowering, LowerAndPartitionSimpleModuleFallbackCorrectly) {
   }
 
   auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6));
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results, trt_results, 0.99));
 }
diff --git a/tests/core/partitioning/BUILD b/tests/core/partitioning/BUILD
index 83722b4271..5f90be2972 100644
--- a/tests/core/partitioning/BUILD
+++ b/tests/core/partitioning/BUILD
@@ -55,21 +55,6 @@ cc_test(
     }),
 )
 
-cc_test(
-    name = "test_fallback_graph_output",
-    srcs = ["test_fallback_graph_output.cpp"],
-    data = [
-        ":jit_models",
-    ],
-    deps = [
-        "//tests/util",
-        "@googletest//:gtest_main",
-    ] + select({
-        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
-        "//conditions:default": ["@libtorch//:libtorch"],
-    }),
-)
-
 cc_test(
     name = "test_loop_fallback",
     srcs = ["test_loop_fallback.cpp"],
@@ -104,7 +89,6 @@ test_suite(
     name = "partitioning_tests",
     tests = [
         ":test_conditionals",
-        ":test_fallback_graph_output",
         ":test_loading_model",
         ":test_loop_fallback",
         ":test_resolve_nontensor_inputs",
diff --git a/tests/core/partitioning/test_fallback_graph_output.cpp b/tests/core/partitioning/test_fallback_graph_output.cpp
deleted file mode 100644
index 98fc4e6128..0000000000
--- a/tests/core/partitioning/test_fallback_graph_output.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-#include <string>
-#include <unordered_set>
-#include "core/compiler.h"
-#include "gtest/gtest.h"
-#include "tests/util/util.h"
-#include "torch/script.h"
-
-#ifndef DISABLE_TEST_IN_CI
-
-TEST(Partitioning, ComputeResNet50FallbackGraphCorrectly) {
-  torch::jit::script::Module mod;
-  try {
-    mod = torch::jit::load("tests/modules/resnet50_traced.jit.pt");
-  } catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    return;
-  }
-
-  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
-  std::vector<torch::jit::IValue> jit_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt_inputs_ivalues;
-  for (auto in_shape : input_shapes) {
-    auto in = at::randint(5, in_shape, {at::kCUDA});
-    jit_inputs_ivalues.push_back(in.clone());
-    trt_inputs_ivalues.push_back(in.clone());
-  }
-
-  std::vector<torch_tensorrt::core::ir::Input> input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})};
-
-  torch_tensorrt::core::CompileSpec cfg(input_ranges);
-  cfg.partition_info.enabled = true;
-  cfg.partition_info.forced_fallback_operators.push_back("aten::add");
-
-  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
-  auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg);
-  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6));
-}
-
-TEST(Partitioning, ComputeMobileNetFallbackGraphCorrectly) {
-  torch::jit::script::Module mod;
-  try {
-    mod = torch::jit::load("tests/modules/mobilenet_v2_traced.jit.pt");
-  } catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    return;
-  }
-
-  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
-  std::vector<torch::jit::IValue> jit_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt_inputs_ivalues;
-  for (auto in_shape : input_shapes) {
-    auto in = at::randint(5, in_shape, {at::kCUDA});
-    jit_inputs_ivalues.push_back(in.clone());
-    trt_inputs_ivalues.push_back(in.clone());
-  }
-
-  std::vector<torch_tensorrt::core::ir::Input> input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})};
-  auto g = mod.get_method("forward").graph();
-  torch_tensorrt::core::CompileSpec cfg(input_ranges);
-  cfg.partition_info.enabled = true;
-  cfg.partition_info.forced_fallback_operators.push_back("aten::hardtanh");
-
-  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
-  auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg);
-  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6));
-}
-#endif
diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD
index 3d56682189..ea2c6ae752 100644
--- a/tests/cpp/BUILD
+++ b/tests/cpp/BUILD
@@ -13,12 +13,9 @@ test_suite(
     name = "api_tests",
     tests = [
         ":test_collections",
-        ":test_compiled_modules",
         ":test_default_input_types",
         ":test_example_tensors",
-        ":test_module_fallback",
         ":test_modules_as_engines",
-        ":test_multiple_registered_engines",
         ":test_runtime_thread_safety",
         ":test_serialization",
     ],
@@ -28,12 +25,9 @@ test_suite(
     name = "aarch64_api_tests",
     tests = [
         ":test_collections",
-        ":test_compiled_modules",
         ":test_default_input_types",
         ":test_example_tensors",
-        ":test_module_fallback",
         ":test_modules_as_engines",
-        ":test_multiple_registered_engines",
         ":test_runtime_thread_safety",
         ":test_serialization",
     ],
@@ -72,21 +66,6 @@ cc_test(
     ],
 )
 
-cc_test(
-    name = "test_multiple_registered_engines",
-    srcs = ["test_multiple_registered_engines.cpp"],
-    data = [
-        "//tests/modules:jit_models",
-    ],
-    deps = [
-        "//tests/util",
-        "@googletest//:gtest_main",
-    ] + select({
-        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
-        "//conditions:default": ["@libtorch//:libtorch"],
-    }),
-)
-
 cc_test(
     name = "test_modules_as_engines",
     timeout = "long",
@@ -110,21 +89,6 @@ cc_test(
     ],
 )
 
-cc_test(
-    name = "test_module_fallback",
-    srcs = ["test_module_fallback.cpp"],
-    data = [
-        "//tests/modules:jit_models",
-    ],
-    deps = [
-        "//tests/util",
-        "@googletest//:gtest_main",
-    ] + select({
-        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],
-        "//conditions:default": ["@libtorch//:libtorch"],
-    }),
-)
-
 cc_test(
     name = "test_collections",
     srcs = ["test_collections.cpp"],
@@ -140,17 +104,6 @@ cc_test(
     }),
 )
 
-cc_test(
-    name = "test_compiled_modules",
-    srcs = ["test_compiled_modules.cpp"],
-    data = [
-        "//tests/modules:jit_models",
-    ],
-    deps = [
-        ":cpp_api_test",
-    ],
-)
-
 cc_test(
     name = "test_multi_gpu_serde",
     srcs = ["test_multi_gpu_serde.cpp"],
diff --git a/tests/cpp/test_compiled_modules.cpp b/tests/cpp/test_compiled_modules.cpp
deleted file mode 100644
index 595dd7044f..0000000000
--- a/tests/cpp/test_compiled_modules.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-#include "cpp_api_test.h"
-
-TEST_P(CppAPITests, CompiledModuleIsClose) {
-  std::vector<torch::jit::IValue> jit_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt_inputs_ivalues;
-  std::vector<torch_tensorrt::Input> shapes;
-  for (uint64_t i = 0; i < input_shapes.size(); i++) {
-    auto in = at::randint(5, input_shapes[i], {at::kCUDA}).to(input_types[i]);
-    jit_inputs_ivalues.push_back(in.clone());
-    trt_inputs_ivalues.push_back(in.clone());
-    auto in_spec = torch_tensorrt::Input(input_shapes[i]);
-    in_spec.dtype = input_types[i];
-    shapes.push_back(in_spec);
-    std::cout << in_spec << std::endl;
-  }
-
-  torch::jit::IValue jit_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod, jit_inputs_ivalues);
-  std::vector<at::Tensor> jit_results;
-  if (jit_results_ivalues.isTuple()) {
-    auto tuple = jit_results_ivalues.toTuple();
-    for (auto t : tuple->elements()) {
-      jit_results.push_back(t.toTensor());
-    }
-  } else {
-    jit_results.push_back(jit_results_ivalues.toTensor());
-  }
-
-  auto spec = torch_tensorrt::ts::CompileSpec(shapes);
-  spec.truncate_long_and_double = true;
-
-  auto trt_mod = torch_tensorrt::ts::compile(mod, spec);
-  torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues);
-  std::vector<at::Tensor> trt_results;
-  if (trt_results_ivalues.isTuple()) {
-    auto tuple = trt_results_ivalues.toTuple();
-    for (auto t : tuple->elements()) {
-      trt_results.push_back(t.toTensor());
-    }
-  } else {
-    trt_results.push_back(trt_results_ivalues.toTensor());
-  }
-
-  for (size_t i = 0; i < trt_results.size(); i++) {
-    ASSERT_TRUE(
-        torch_tensorrt::tests::util::almostEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i]), threshold));
-  }
-}
-
-#ifndef DISABLE_TEST_IN_CI
-
-INSTANTIATE_TEST_SUITE_P(
-    CompiledModuleForwardIsCloseSuite,
-    CppAPITests,
-    testing::Values(
-        PathAndInput({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/resnet50_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/mobilenet_v2_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/resnet18_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/resnet50_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 8e-3}),
-        PathAndInput({"tests/modules/bert_base_uncased_traced.jit.pt", {{1, 14}, {1, 14}}, {at::kInt, at::kInt}, 8e-2}),
-        PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 8e-2})));
-
-#endif
diff --git a/tests/cpp/test_module_fallback.cpp b/tests/cpp/test_module_fallback.cpp
deleted file mode 100644
index d1221cde4d..0000000000
--- a/tests/cpp/test_module_fallback.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#include <string>
-#include "gtest/gtest.h"
-#include "tests/util/util.h"
-#include "torch/script.h"
-#include "torch_tensorrt/torch_tensorrt.h"
-
-#ifndef DISABLE_TEST_IN_CI
-
-TEST(CppAPITest, ResNetModuleFallbacksCorrectly) {
-  torch::jit::script::Module mod;
-  try {
-    mod = torch::jit::load("tests/modules/resnet18_scripted.jit.pt");
-  } catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    ASSERT_TRUE(false);
-  }
-
-  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
-  std::vector<torch::jit::IValue> jit_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt_inputs_ivalues;
-  for (auto in_shape : input_shapes) {
-    auto in = at::randint(5, in_shape, {at::kCUDA});
-    jit_inputs_ivalues.push_back(in.clone());
-    trt_inputs_ivalues.push_back(in.clone());
-  }
-
-  torch_tensorrt::ts::CompileSpec cfg(input_shapes);
-  cfg.torch_executed_modules.push_back("torchvision.models.resnet.BasicBlock");
-
-  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
-  auto trt_mod = torch_tensorrt::ts::compile(mod, cfg);
-  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6));
-}
-
-TEST(CppAPITest, MobileNetModuleFallbacksCorrectlyWithOneEngine) {
-  torch::jit::script::Module mod;
-  try {
-    mod = torch::jit::load("tests/modules/mobilenet_v2_scripted.jit.pt");
-  } catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    ASSERT_TRUE(false);
-  }
-
-  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
-  std::vector<torch::jit::IValue> jit_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt_inputs_ivalues;
-  for (auto in_shape : input_shapes) {
-    auto in = at::randint(5, in_shape, {at::kCUDA});
-    jit_inputs_ivalues.push_back(in.clone());
-    trt_inputs_ivalues.push_back(in.clone());
-  }
-
-  torch_tensorrt::ts::CompileSpec cfg(input_shapes);
-  cfg.min_block_size = 5;
-  cfg.torch_executed_modules.push_back("torchvision.models.mobilenetv2.ConvBNActivation");
-
-  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
-  auto trt_mod = torch_tensorrt::ts::compile(mod, cfg);
-
-  auto g = trt_mod.get_method("forward").graph();
-  auto nodes = g->block()->nodes();
-  std::size_t trt_count = 0;
-  for (const auto n : nodes) {
-    if (n->kind().toQualString() == std::string("tensorrt::execute_engine")) {
-      trt_count++;
-    }
-  }
-  ASSERT_TRUE(trt_count == 1);
-
-  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(jit_results, trt_results, 2e-6));
-}
-#endif
diff --git a/tests/cpp/test_modules_as_engines.cpp b/tests/cpp/test_modules_as_engines.cpp
index 4437b1218c..21670acdaf 100644
--- a/tests/cpp/test_modules_as_engines.cpp
+++ b/tests/cpp/test_modules_as_engines.cpp
@@ -15,40 +15,7 @@ TEST_P(CppAPITests, ModuleAsEngineIsClose) {
   auto trt_results = torch_tensorrt::tests::util::RunModuleForwardAsEngine(mod, inputs);
 
   ASSERT_TRUE(
-      torch_tensorrt::tests::util::almostEqual(jit_results[0], trt_results[0].reshape_as(jit_results[0]), threshold));
-}
-
-TEST_P(CppAPITests, ModuleToEngineToModuleIsClose) {
-  std::vector<at::Tensor> inputs;
-  std::vector<torch::jit::IValue> inputs_ivalues;
-  for (uint64_t i = 0; i < input_shapes.size(); i++) {
-    inputs.push_back(at::randint(5, input_shapes[i], {at::kCUDA}).to(input_types[i]));
-    inputs_ivalues.push_back(inputs[inputs.size() - 1].clone());
-  }
-
-  torch::jit::IValue jit_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod, inputs_ivalues);
-  std::vector<at::Tensor> jit_results;
-  jit_results.push_back(jit_results_ivalues.toTensor());
-
-  std::vector<c10::ArrayRef<int64_t>> input_ranges;
-  for (auto in : inputs) {
-    input_ranges.push_back(in.sizes());
-  }
-
-  auto compile_spec = torch_tensorrt::ts::CompileSpec({input_ranges});
-  int device_id = 0;
-  cudaGetDevice(&device_id);
-  compile_spec.device.device_type = torch_tensorrt::Device::DeviceType::kGPU;
-  compile_spec.device.gpu_id = device_id;
-  auto engine = torch_tensorrt::ts::convert_method_to_trt_engine(mod, "forward", input_ranges);
-  auto trt_mod = torch_tensorrt::ts::embed_engine_in_new_module(engine, compile_spec.device);
-
-  torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, inputs_ivalues);
-  std::vector<at::Tensor> trt_results;
-  trt_results.push_back(trt_results_ivalues.toTensor());
-
-  ASSERT_TRUE(
-      torch_tensorrt::tests::util::almostEqual(jit_results[0], trt_results[0].reshape_as(jit_results[0]), threshold));
+      torch_tensorrt::tests::util::cosineSimEqual(jit_results[0], trt_results[0].reshape_as(jit_results[0]), threshold));
 }
 
 #ifndef DISABLE_TEST_IN_CI
@@ -57,12 +24,8 @@ INSTANTIATE_TEST_SUITE_P(
     ModuleAsEngineForwardIsCloseSuite,
     CppAPITests,
     testing::Values(
-        PathAndInput({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/resnet50_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/mobilenet_v2_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/resnet18_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/resnet50_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
-        PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 1e-4}),
-        PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 8e-2})));
+        PathAndInput({"tests/modules/resnet50_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
+        PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
+        PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
+        PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99})));
 #endif
diff --git a/tests/cpp/test_multi_gpu_serde.cpp b/tests/cpp/test_multi_gpu_serde.cpp
index 8672ae9517..0b3944125b 100644
--- a/tests/cpp/test_multi_gpu_serde.cpp
+++ b/tests/cpp/test_multi_gpu_serde.cpp
@@ -23,12 +23,12 @@ TEST_P(CppAPITests, CompiledModuleIsClose) {
   trt_results.push_back(trt_results_ivalues.toTensor());
 
   for (size_t i = 0; i < trt_results.size(); i++) {
-    ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
-        jit_results[i], trt_results[i].reshape_as(jit_results[i]).to(torch::Device("cuda:0")), 2e-5));
+    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(
+        jit_results[i], trt_results[i].reshape_as(jit_results[i]).to(torch::Device("cuda:0")), threshold));
   }
 }
 
 INSTANTIATE_TEST_SUITE_P(
     CompiledModuleForwardIsCloseSuite,
     CppAPITests,
-    testing::Values(PathAndInput({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5})));
+    testing::Values(PathAndInput({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99})));
diff --git a/tests/cpp/test_multiple_registered_engines.cpp b/tests/cpp/test_multiple_registered_engines.cpp
deleted file mode 100644
index 2746687f68..0000000000
--- a/tests/cpp/test_multiple_registered_engines.cpp
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <string>
-#include "gtest/gtest.h"
-#include "tests/util/util.h"
-#include "torch/script.h"
-#include "torch_tensorrt/torch_tensorrt.h"
-
-#ifndef DISABLE_TEST_IN_CI
-
-TEST(CppAPITest, CanRunMultipleEngines) {
-  torch::jit::script::Module mod1;
-  torch::jit::script::Module mod2;
-  try {
-    mod1 = torch::jit::load("tests/modules/resnet50_traced.jit.pt");
-    mod2 = torch::jit::load("tests/modules/resnet18_traced.jit.pt");
-  } catch (const c10::Error& e) {
-    std::cerr << "error loading the model\n";
-    return;
-  }
-
-  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
-
-  std::vector<torch::jit::IValue> jit1_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt1_inputs_ivalues;
-  for (auto in_shape : input_shapes) {
-    auto in = at::randint(5, in_shape, {at::kCUDA});
-    jit1_inputs_ivalues.push_back(in.clone());
-    trt1_inputs_ivalues.push_back(in.clone());
-  }
-
-  std::vector<torch::jit::IValue> jit2_inputs_ivalues;
-  std::vector<torch::jit::IValue> trt2_inputs_ivalues;
-  for (auto in_shape : input_shapes) {
-    auto in = at::randint(5, in_shape, {at::kCUDA});
-    jit2_inputs_ivalues.push_back(in.clone());
-    trt2_inputs_ivalues.push_back(in.clone());
-  }
-
-  torch::jit::IValue jit1_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod1, jit1_inputs_ivalues);
-  std::vector<at::Tensor> jit1_results;
-  jit1_results.push_back(jit1_results_ivalues.toTensor());
-
-  torch::jit::IValue jit2_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod2, jit2_inputs_ivalues);
-  std::vector<at::Tensor> jit2_results;
-  jit2_results.push_back(jit2_results_ivalues.toTensor());
-
-  auto trt_mod1 = torch_tensorrt::ts::compile(mod1, input_shapes);
-  torch::jit::IValue trt1_results_ivalues =
-      torch_tensorrt::tests::util::RunModuleForward(trt_mod1, trt1_inputs_ivalues);
-  std::vector<at::Tensor> trt1_results;
-  trt1_results.push_back(trt1_results_ivalues.toTensor());
-
-  auto trt_mod2 = torch_tensorrt::ts::compile(mod2, input_shapes);
-  torch::jit::IValue trt2_results_ivalues =
-      torch_tensorrt::tests::util::RunModuleForward(trt_mod2, trt2_inputs_ivalues);
-  std::vector<at::Tensor> trt2_results;
-  trt2_results.push_back(trt2_results_ivalues.toTensor());
-
-  for (size_t i = 0; i < trt1_results.size(); i++) {
-    ASSERT_TRUE(
-        torch_tensorrt::tests::util::almostEqual(jit1_results[i], trt1_results[i].reshape_as(jit1_results[i]), 2e-5));
-  }
-
-  for (size_t i = 0; i < trt2_results.size(); i++) {
-    ASSERT_TRUE(
-        torch_tensorrt::tests::util::almostEqual(jit2_results[i], trt2_results[i].reshape_as(jit2_results[i]), 2e-5));
-  }
-}
-#endif
diff --git a/tests/py/api/custom_models.py b/tests/py/api/custom_models.py
new file mode 100644
index 0000000000..c6c0bb4c68
--- /dev/null
+++ b/tests/py/api/custom_models.py
@@ -0,0 +1,27 @@
+import torch
+from transformers import BertModel, BertTokenizer, BertConfig
+
+def BertModule():
+    """Return a TorchScript-traced, pretrained ``bert-base-uncased`` BertModel.
+
+    The trace uses the token-id and segment-id tensors built below as example
+    inputs. Loading the tokenizer/weights may download them on first use.
+    """
+    model_name = "bert-base-uncased"
+    enc = BertTokenizer.from_pretrained(model_name)
+    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+    tokenized_text = enc.tokenize(text)
+    # Replace the token at index 8 with [MASK] before converting to ids.
+    masked_index = 8
+    tokenized_text[masked_index] = "[MASK]"
+    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+    # One segment id per token; assumes the text yields 14 tokens - TODO confirm.
+    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+    tokens_tensor = torch.tensor([indexed_tokens])
+    segments_tensors = torch.tensor([segments_ids])
+    # Load the pretrained weights once and switch *that* model to eval mode, so
+    # the model being traced is the one explicitly placed in inference mode.
+    model = BertModel.from_pretrained(model_name, torchscript=True)
+    model.eval()
+    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+    return traced_model
diff --git a/tests/py/api/test_e2e_behavior.py b/tests/py/api/test_e2e_behavior.py
index d1da3e0465..35cd3509dc 100644
--- a/tests/py/api/test_e2e_behavior.py
+++ b/tests/py/api/test_e2e_behavior.py
@@ -5,103 +5,6 @@
 import copy
 from typing import Dict
 
-
-class TestCompileHalf(unittest.TestCase):
-    def test_compile_script_half(self):
-        self.model = models.resnet18(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-        self.scripted_model.half()
-
-        compile_spec = {
-            "inputs": [torchtrt.Input(shape=self.input.shape, dtype=torch.half)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.half},
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (
-            (trt_mod(self.input.half()) - self.scripted_model(self.input.half()))
-            .abs()
-            .max()
-        )
-        torchtrt.logging.log(torchtrt.logging.Level.Debug, "Max diff: " + str(same))
-        self.assertTrue(same < 3e-2)
-
-    def test_compile_script_half_by_default(self):
-        self.model = models.resnet18(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-        self.scripted_model.half()
-
-        compile_spec = {
-            "inputs": [torchtrt.Input(shape=self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.float, torch.half},
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (
-            (trt_mod(self.input.half()) - self.scripted_model(self.input.half()))
-            .abs()
-            .max()
-        )
-        torchtrt.logging.log(torchtrt.logging.Level.Debug, "Max diff: " + str(same))
-        self.assertTrue(same < 3e-2)
-
-
-class TestFallbackToTorch(unittest.TestCase):
-    def test_fallback(self):
-        self.model = models.resnet18(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-                "allow_gpu_fallback": False,
-                "disable_tf32": False,
-            },
-            "require_full_compilation": False,
-            "torch_executed_ops": ["aten::max_pool2d"],
-            "min_block_size": 1,
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
-
-    def test_module_fallback(self):
-        self.model = models.resnet18(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.scripted_model = torch.jit.script(self.model)
-
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-                "allow_gpu_fallback": False,
-                "disable_tf32": False,
-            },
-            "require_full_compilation": False,
-            "torch_executed_modules": ["torchvision.models.resnet.BasicBlock"],
-            "min_block_size": 1,
-        }
-
-        trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
-
-
 class TestInputTypeDefaultsFP32Model(unittest.TestCase):
     def test_input_use_default_fp32(self):
         self.model = models.resnet18(pretrained=True).eval().to("cuda")
diff --git a/tests/py/api/test_embed_engines.py b/tests/py/api/test_embed_engines.py
new file mode 100644
index 0000000000..133c4c6a50
--- /dev/null
+++ b/tests/py/api/test_embed_engines.py
@@ -0,0 +1,60 @@
+import unittest
+import torch_tensorrt as torchtrt
+import torch
+import torchvision.models as models
+import copy
+import timm
+import custom_models as cm
+from typing import Dict
+from utils import cosine_similarity, COSINE_THRESHOLD
+
+class TestModelToEngineToModel(unittest.TestCase):
+    """Script a model, convert `forward` to a TRT engine, embed it in a new
+    module, and compare that module's output with the eager model's."""
+
+    def test_resnet50(self):
+        """ResNet-50 from torchvision, FP32 only."""
+        self.model = models.resnet50(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+        input_spec = torchtrt.Input(
+            self.input.shape, dtype=torch.float, format=torch.contiguous_format
+        )
+        engine_kwargs = {
+            "inputs": [input_spec],
+            "device": {"device_type": torchtrt.DeviceType.GPU, "gpu_id": 0},
+            "enabled_precisions": {torch.float},
+        }
+
+        self.scripted_model = torch.jit.script(self.model)
+        trt_engine = torchtrt.ts.convert_method_to_trt_engine(self.scripted_model, "forward", **engine_kwargs)
+        trt_mod = torchtrt.ts.embed_engine_in_new_module(trt_engine)
+
+        reference = self.model(self.input)
+        cos_sim = cosine_similarity(reference, trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_efficientnet_b0(self):
+        """EfficientNet-B0 from timm, FP32 only."""
+        self.model = timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+        input_spec = torchtrt.Input(
+            self.input.shape, dtype=torch.float, format=torch.contiguous_format
+        )
+        engine_kwargs = {
+            "inputs": [input_spec],
+            "device": {"device_type": torchtrt.DeviceType.GPU, "gpu_id": 0},
+            "enabled_precisions": {torch.float},
+        }
+
+        self.scripted_model = torch.jit.script(self.model)
+        trt_engine = torchtrt.ts.convert_method_to_trt_engine(self.scripted_model, "forward", **engine_kwargs)
+        trt_mod = torchtrt.ts.embed_engine_in_new_module(trt_engine)
+
+        reference = self.model(self.input)
+        cos_sim = cosine_similarity(reference, trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/py/api/test_models.py b/tests/py/api/test_models.py
new file mode 100644
index 0000000000..84860b9305
--- /dev/null
+++ b/tests/py/api/test_models.py
@@ -0,0 +1,124 @@
+import unittest
+import torch_tensorrt as torchtrt
+import torch
+import torchvision.models as models
+import copy
+import timm
+import custom_models as cm
+from typing import Dict
+from utils import cosine_similarity, COSINE_THRESHOLD
+
+class TestModels(unittest.TestCase):
+    def test_resnet50(self):
+        self.model = models.resnet50(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+        }
+
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_mobilenet_v2(self):
+        self.model = models.mobilenet_v2(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+        }
+
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_efficientnet_b0(self):
+        self.model = timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+        }
+
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_bert_base_uncased(self):
+        self.model = cm.BertModule().cuda()
+        self.input = torch.randint(0, 5, (1, 14), dtype=torch.int32).to("cuda")
+
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(self.input.shape, dtype=self.input.dtype, format=torch.contiguous_format),
+                torchtrt.Input(self.input.shape, dtype=self.input.dtype, format=torch.contiguous_format)
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+            "truncate_long_and_double": True,
+        }
+        with torchtrt.logging.errors():
+            trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
+
+        model_outputs = self.model(self.input, self.input)
+        trt_model_outputs = trt_mod(self.input, self.input)
+        for out, trt_out in zip(model_outputs, trt_model_outputs):
+            cos_sim = cosine_similarity(out, trt_out)
+            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_resnet50_half(self):
+        self.model = models.resnet50(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+        self.scripted_model = torch.jit.script(self.model)
+        self.scripted_model.half()
+
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.half, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.half},
+        }
+
+        trt_mod = torchtrt.compile(self.scripted_model, **compile_spec)
+        cos_sim = cosine_similarity(self.model.half()(self.input.half()), trt_mod(self.input.half()))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/py/api/test_module_fallback.py b/tests/py/api/test_module_fallback.py
new file mode 100644
index 0000000000..5d5fc425c2
--- /dev/null
+++ b/tests/py/api/test_module_fallback.py
@@ -0,0 +1,53 @@
+import unittest
+import torch_tensorrt as torchtrt
+import torch
+import torchvision.models as models
+import copy
+from typing import Dict
+from utils import cosine_similarity, COSINE_THRESHOLD
+
+class TestModuleFallback(unittest.TestCase):
+    def test_fallback_resnet18(self):
+        self.model = models.resnet18(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+            "torch_executed_modules": ["torchvision.models.resnet.BasicBlock"],
+        }
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_fallback_mobilenet_v2(self):
+        self.model = models.mobilenet_v2(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+            "torch_executed_modules": ["torchvision.models.mobilenetv2.ConvBNActivation"],
+            "min_block_size": 5,
+        }
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Mobilenet V2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/py/api/test_multiple_registered_engines.py b/tests/py/api/test_multiple_registered_engines.py
new file mode 100644
index 0000000000..fb201f9d8f
--- /dev/null
+++ b/tests/py/api/test_multiple_registered_engines.py
@@ -0,0 +1,40 @@
+import unittest
+import torch_tensorrt as torchtrt
+import torch
+import torchvision.models as models
+import copy
+import timm
+import custom_models as cm
+from typing import Dict
+from utils import cosine_similarity, COSINE_THRESHOLD
+
+class TestModelToEngineToModel(unittest.TestCase):
+    def test_multiple_engines(self):
+        """Compile two models in the same test and check each TRT module against its PyTorch source."""
+        self.resnet18 = models.resnet18(pretrained=True).eval().to("cuda")
+        self.resnet50 = models.resnet50(pretrained=True).eval().to("cuda")
+        self.input1 = torch.randn((1, 3, 224, 224)).to("cuda")
+        self.input2 = torch.randn((1, 3, 224, 224)).to("cuda")
+
+        # input1 and input2 have the same shape, so one compile spec serves both models.
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(self.input1.shape, dtype=torch.float, format=torch.contiguous_format)
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+        }
+        rn18_trt_mod = torchtrt.compile(self.resnet18, **compile_spec)
+        rn50_trt_mod = torchtrt.compile(self.resnet50, **compile_spec)
+
+        cos_sim = cosine_similarity(self.resnet18(self.input1), rn18_trt_mod(self.input1))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+        cos_sim = cosine_similarity(self.resnet50(self.input2), rn50_trt_mod(self.input2))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/py/api/test_operator_fallback.py b/tests/py/api/test_operator_fallback.py
new file mode 100644
index 0000000000..25d1b7cd92
--- /dev/null
+++ b/tests/py/api/test_operator_fallback.py
@@ -0,0 +1,52 @@
+import unittest
+import torch_tensorrt as torchtrt
+import torch
+import torchvision.models as models
+import copy
+from typing import Dict
+from utils import cosine_similarity, COSINE_THRESHOLD
+
+class TestFallbackModels(unittest.TestCase):
+    def test_fallback_resnet18(self):
+        self.model = models.resnet18(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+            "torch_executed_ops": ["aten::add"],
+        }
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+    def test_fallback_mobilenet_v2(self):
+        self.model = models.mobilenet_v2(pretrained=True).eval().to("cuda")
+        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
+        compile_spec = {
+            "inputs": [
+                torchtrt.Input(
+                    self.input.shape, dtype=torch.float, format=torch.contiguous_format
+                )
+            ],
+            "device": {
+                "device_type": torchtrt.DeviceType.GPU,
+                "gpu_id": 0,
+            },
+            "enabled_precisions": {torch.float},
+            "torch_executed_ops": ["aten::hardtanh"],
+        }
+        trt_mod = torchtrt.compile(self.model, **compile_spec)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Mobilenet V2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/py/api/test_ts_backend.py b/tests/py/api/test_ts_backend.py
index d0654a8f75..891f4ba178 100644
--- a/tests/py/api/test_ts_backend.py
+++ b/tests/py/api/test_ts_backend.py
@@ -4,7 +4,7 @@
 import torchvision.models as models
 import copy
 from typing import Dict
-
+from utils import cosine_similarity, COSINE_THRESHOLD
 
 class TestCompile(unittest.TestCase):
     def test_compile_traced(self):
@@ -26,8 +26,8 @@ def test_compile_traced(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_compile_script(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -40,8 +40,8 @@ def test_compile_script(self):
                 device=torchtrt.Device(gpu_id=0),
                 enabled_precisions={torch.float},
             )
-            same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-            self.assertTrue(same < 2e-2)
+            cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_compile_global(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -53,21 +53,8 @@ def test_compile_global(self):
             device=torchtrt.Device(gpu_id=0),
             enabled_precisions={torch.float},
         )
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_compile_global_nn_mod(self):
-        self.model = models.vgg16(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        with torch.no_grad():
-            trt_mod = torchtrt.compile(
-                self.model,
-                inputs=[self.input],
-                device=torchtrt.Device(gpu_id=0),
-                enabled_precisions={torch.float},
-            )
-            same = (trt_mod(self.input) - self.model(self.input)).abs().max()
-            self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_from_torch_tensor(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -83,8 +70,8 @@ def test_from_torch_tensor(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_device(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -97,8 +84,8 @@ def test_device(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_default_device(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -107,52 +94,8 @@ def test_default_device(self):
         compile_spec = {"inputs": [self.input], "enabled_precisions": {torch.float}}
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-    def test_compile_script_from_dict(self):
-        self.model = models.vgg16(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.traced_model = torch.jit.trace(self.model, [self.input])
-        compile_spec = {
-            "inputs": [torchtrt.Input(shape=self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-            },
-            "enabled_precisions": {torch.float},
-        }
-
-        trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
-
-
-class TestPTtoTRTtoPT(unittest.TestCase):
-    def test_pt_to_trt_to_pt(self):
-        self.model = models.vgg16(pretrained=True).eval().to("cuda")
-        self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.ts_model = torch.jit.trace(self.model, [self.input])
-
-        compile_spec = {
-            "inputs": [torchtrt.Input(self.input.shape)],
-            "device": {
-                "device_type": torchtrt.DeviceType.GPU,
-                "gpu_id": 0,
-                "allow_gpu_fallback": False,
-                "disable_tf32": False,
-            },
-        }
-
-        trt_engine = torchtrt.ts.convert_method_to_trt_engine(
-            self.ts_model, "forward", **compile_spec
-        )
-        trt_mod = torchtrt.ts.embed_engine_in_new_module(
-            trt_engine, torchtrt.Device("cuda:0")
-        )
-        same = (trt_mod(self.input) - self.ts_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
-
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 class TestCheckMethodOpSupport(unittest.TestCase):
     def test_check_support(self):
diff --git a/tests/py/api/utils.py b/tests/py/api/utils.py
new file mode 100644
index 0000000000..e71bb09c6d
--- /dev/null
+++ b/tests/py/api/utils.py
@@ -0,0 +1,9 @@
+import torch
+
+COSINE_THRESHOLD=0.99
+
+def cosine_similarity(gt_tensor, pred_tensor):
+    res = torch.nn.functional.cosine_similarity(gt_tensor.flatten().to(torch.float32), pred_tensor.flatten().to(torch.float32), dim=0, eps=1e-6)
+    res = res.cpu().detach().item()
+
+    return res
diff --git a/tests/util/util.cpp b/tests/util/util.cpp
index 13d0d18566..91004c06ff 100644
--- a/tests/util/util.cpp
+++ b/tests/util/util.cpp
@@ -1,10 +1,23 @@
 #include "core/util/prelude.h"
 #include "torch/script.h"
+#include "torch/torch.h"
 
 namespace torch_tensorrt {
 namespace tests {
 namespace util {
 
+// Returns true when the cosine similarity of the two flattened tensors is at least `threshold`.
+bool cosineSimEqual(const at::Tensor& computed_tensor, const at::Tensor& gt_tensor, float threshold = 0.99f) {
+  // Compare in fp32, as the Python test helper does, so half-precision outputs are not reduced in fp16; read the score once.
+  const float score = torch::nn::functional::cosine_similarity(computed_tensor.flatten().to(torch::kFloat), gt_tensor.flatten().to(torch::kFloat), torch::nn::functional::CosineSimilarityFuncOptions().dim(0)).item<float>();
+  std::ostringstream ss;
+  ss << computed_tensor << std::endl << gt_tensor << std::endl;
+  LOG_GRAPH(ss.str());
+  LOG_GRAPH(std::string("Cosine Similarity score: ") + std::to_string(score));
+  LOG_GRAPH(std::string("Acceptable Threshold: ") + std::to_string(threshold));
+  return score >= threshold;
+}
+
 bool almostEqual(const at::Tensor& computed_tensor, const at::Tensor& gt_tensor, float atol = 1e-8, float rtol = 1e-5) {
   std::ostringstream ss;
   ss << computed_tensor << std::endl << gt_tensor << std::endl;
diff --git a/tests/util/util.h b/tests/util/util.h
index f39e2a5766..1ea62a16e0 100644
--- a/tests/util/util.h
+++ b/tests/util/util.h
@@ -11,6 +11,8 @@ namespace torch_tensorrt {
 namespace tests {
 namespace util {
 
+// True when the cosine similarity of the flattened tensors is >= threshold.
+bool cosineSimEqual(const at::Tensor& computed_tensor, const at::Tensor& gt_tensor, float threshold = 0.99f);
 bool almostEqual(const at::Tensor& computed_tensor, const at::Tensor& gt_tensor, float atol = 1e-8, float rtol = 1e-5);
 
 bool exactlyEqual(const at::Tensor& a, const at::Tensor& b);

From c6f3103cd3295f3be5c37e349ac5aa0a809bacb4 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Wed, 31 Aug 2022 11:18:53 -0700
Subject: [PATCH 02/12] chore: move to cosine similarity comparison

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 tests/cpp/test_collections.cpp   | 10 ++++----
 tests/py/api/test_collections.py | 41 +++++++++++++-------------------
 tests/py/api/utils.py            |  7 +++++-
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp
index d01665adcd..3318aec99d 100644
--- a/tests/cpp/test_collections.cpp
+++ b/tests/cpp/test_collections.cpp
@@ -42,7 +42,7 @@ TEST(CppAPITests, TestCollectionStandardTensorInput) {
   auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings);
   auto trt_out = trt_mod.forward(inputs_);
 
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5));
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(out.toTensor(), trt_out.toTensor(), 0.99));
 }
 
 TEST(CppAPITests, TestCollectionTupleInput) {
@@ -85,7 +85,7 @@ TEST(CppAPITests, TestCollectionTupleInput) {
   auto trt_mod = torch_tensorrt::torchscript::compile(mod, compile_settings);
   auto trt_out = trt_mod.forward(complex_inputs);
 
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5));
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(out.toTensor(), trt_out.toTensor(), 0.99));
 }
 
 TEST(CppAPITests, TestCollectionListInput) {
@@ -144,7 +144,7 @@ TEST(CppAPITests, TestCollectionListInput) {
   LOG_DEBUG("Finish compile");
   auto trt_out = trt_mod.forward(complex_inputs);
 
-  ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(out.toTensor(), trt_out.toTensor(), 1e-5));
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(out.toTensor(), trt_out.toTensor(), 0.99));
 }
 
 TEST(CppAPITests, TestCollectionTupleInputOutput) {
@@ -192,7 +192,7 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) {
   auto trt_out = trt_mod.forward(complex_inputs);
 
   ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
-      out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5));
+        out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5));
   ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
       out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5));
 }
@@ -317,4 +317,4 @@ TEST(CppAPITests, TestCollectionComplexModel) {
       out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5));
   ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
       out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5));
-}
\ No newline at end of file
+}
diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py
index dfae3f18c9..88147e005e 100644
--- a/tests/py/api/test_collections.py
+++ b/tests/py/api/test_collections.py
@@ -3,6 +3,7 @@
 import torch
 import torchvision.models as models
 import os
+from utils import cosine_similarity, COSINE_THRESHOLD
 
 
 def find_repo_root(max_depth=10):
@@ -40,12 +41,8 @@ def test_compile(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        same = (
-            (trt_mod(self.input, self.input) - self.model(self.input, self.input))
-            .abs()
-            .max()
-        )
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input, self.input), trt_mod(self.input, self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"standard_tensor_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 class TestTupleInput(unittest.TestCase):
@@ -68,12 +65,8 @@ def test_compile(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        same = (
-            (trt_mod((self.input, self.input)) - self.model((self.input, self.input)))
-            .abs()
-            .max()
-        )
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model((self.input, self.input)), trt_mod((self.input, self.input)))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"tuple_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 class TestListInput(unittest.TestCase):
@@ -94,12 +87,8 @@ def test_compile(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        same = (
-            (trt_mod([self.input, self.input]) - self.model([self.input, self.input]))
-            .abs()
-            .max()
-        )
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model([self.input, self.input]), trt_mod([self.input, self.input]))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"list_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 class TestTupleInputOutput(unittest.TestCase):
@@ -124,8 +113,9 @@ def test_compile(self):
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
         trt_out = trt_mod((self.input, self.input))
         pyt_out = self.model((self.input, self.input))
-        results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)]
-        self.assertTrue(all(results))
+        for (t, p) in zip(trt_out, pyt_out):
+            cos_sim = cosine_similarity(t, p)
+            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"tuple_input_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 class TestListInputOutput(unittest.TestCase):
@@ -150,8 +140,10 @@ def test_compile(self):
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
         trt_out = trt_mod((self.input, self.input))
         pyt_out = self.model((self.input, self.input))
-        results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)]
-        self.assertTrue(all(results))
+
+        for (t, p) in zip(trt_out, pyt_out):
+            cos_sim = cosine_similarity(t, p)
+            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"list_input_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 class TestListInputTupleOutput(unittest.TestCase):
@@ -176,8 +168,9 @@ def test_compile(self):
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
         trt_out = trt_mod((self.input, self.input))
         pyt_out = self.model((self.input, self.input))
-        results = [(t - p).abs().max() < 2e-2 for (t, p) in zip(trt_out, pyt_out)]
-        self.assertTrue(all(results))
+        for (t, p) in zip(trt_out, pyt_out):
+            cos_sim = cosine_similarity(t, p)
+            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"list_input_tuple_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 if __name__ == "__main__":
diff --git a/tests/py/api/utils.py b/tests/py/api/utils.py
index e71bb09c6d..a43b54a4a7 100644
--- a/tests/py/api/utils.py
+++ b/tests/py/api/utils.py
@@ -3,7 +3,12 @@
 COSINE_THRESHOLD=0.99
 
 def cosine_similarity(gt_tensor, pred_tensor):
-    res = torch.nn.functional.cosine_similarity(gt_tensor.flatten().to(torch.float32), pred_tensor.flatten().to(torch.float32), dim=0, eps=1e-6)
+    gt_tensor = gt_tensor.flatten().to(torch.float32)
+    pred_tensor = pred_tensor.flatten().to(torch.float32)
+    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
+        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
+            return 1.0
+    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
     res = res.cpu().detach().item()
 
     return res

From beeac7cd39761cd4919652f6659f0cb659b29812 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Wed, 31 Aug 2022 17:18:13 -0700
Subject: [PATCH 03/12] refactor: Refactor nox file testing

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 .circleci/config.yml                          |   1 +
 noxfile.py                                    | 101 +++++++-----------
 py/torch_tensorrt/ts/_compile_spec.py         |   6 +-
 tests/py/api/test_embed_engines.py            |   1 -
 tests/py/hw/test_api_dla.py                   |   9 +-
 tests/py/hw/test_multi_gpu.py                 |  17 +--
 tests/py/integrations/test_to_backend_api.py  |   9 +-
 .../test_trt_intercompatibility.py            |   6 +-
 tests/py/{api => models}/custom_models.py     |   0
 tests/py/{api => models}/test_models.py       |   0
 .../test_multiple_registered_engines.py       |   0
 tests/py/models/utils.py                      |  14 +++
 12 files changed, 75 insertions(+), 89 deletions(-)
 rename tests/py/{api => models}/custom_models.py (100%)
 rename tests/py/{api => models}/test_models.py (100%)
 rename tests/py/{api => models}/test_multiple_registered_engines.py (100%)
 create mode 100644 tests/py/models/utils.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index dcbc84cc9a..16dda8609f 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -435,6 +435,7 @@ commands:
             mkdir -p /tmp/artifacts/test_results
             cd tests/py
             pytest --junitxml=/tmp/artifacts/test_results/api/api_test_results.xml api/
+            pytest --junitxml=/tmp/artifacts/test_results/models/models_test_results.xml models/
             pytest --junitxml=/tmp/artifacts/test_results/integrations/integrations_test_results.xml integrations/
             cd ~/project
 
diff --git a/noxfile.py b/noxfile.py
index 41926b5ee1..2b8e2da9b3 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -30,13 +30,15 @@
 if USE_HOST_DEPS:
     print("Using dependencies from host python")
 
+# Set epochs to train VGG model for accuracy tests
+EPOCHS=25
+
 SUPPORTED_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
 
 nox.options.sessions = [
     "l0_api_tests-" + "{}.{}".format(sys.version_info.major, sys.version_info.minor)
 ]
 
-
 def install_deps(session):
     print("Installing deps")
     session.install("-r", os.path.join(TOP_DIR, "py", "requirements.txt"))
@@ -63,31 +65,6 @@ def install_torch_trt(session):
         session.run("python", "setup.py", "develop")
 
 
-def download_datasets(session):
-    print(
-        "Downloading dataset to path",
-        os.path.join(TOP_DIR, "examples/int8/training/vgg16"),
-    )
-    session.chdir(os.path.join(TOP_DIR, "examples/int8/training/vgg16"))
-    session.run_always(
-        "wget", "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz", external=True
-    )
-    session.run_always("tar", "-xvzf", "cifar-10-binary.tar.gz", external=True)
-    session.run_always(
-        "mkdir",
-        "-p",
-        os.path.join(TOP_DIR, "tests/accuracy/datasets/data"),
-        external=True,
-    )
-    session.run_always(
-        "cp",
-        "-rpf",
-        os.path.join(TOP_DIR, "examples/int8/training/vgg16/cifar-10-batches-bin"),
-        os.path.join(TOP_DIR, "tests/accuracy/datasets/data/cidar-10-batches-bin"),
-        external=True,
-    )
-
-
 def train_model(session):
     session.chdir(os.path.join(TOP_DIR, "examples/int8/training/vgg16"))
     session.install("-r", "requirements.txt")
@@ -107,14 +84,14 @@ def train_model(session):
             "--ckpt-dir",
             "vgg16_ckpts",
             "--epochs",
-            "25",
+            str(EPOCHS),
             env={"PYTHONPATH": PYT_PATH},
         )
 
         session.run_always(
             "python",
             "export_ckpt.py",
-            "vgg16_ckpts/ckpt_epoch25.pth",
+            "vgg16_ckpts/ckpt_epoch" + str(EPOCHS) + ".pth",
             env={"PYTHONPATH": PYT_PATH},
         )
     else:
@@ -130,10 +107,10 @@ def train_model(session):
             "--ckpt-dir",
             "vgg16_ckpts",
             "--epochs",
-            "25",
+            str(EPOCHS),
         )
 
-        session.run_always("python", "export_ckpt.py", "vgg16_ckpts/ckpt_epoch25.pth")
+        session.run_always("python", "export_ckpt.py", "vgg16_ckpts/ckpt_epoch" + str(EPOCHS) + ".pth")
 
 
 def finetune_model(session):
@@ -156,9 +133,9 @@ def finetune_model(session):
             "--ckpt-dir",
             "vgg16_ckpts",
             "--start-from",
-            "25",
+            str(EPOCHS),
             "--epochs",
-            "26",
+            str(EPOCHS+1),
             env={"PYTHONPATH": PYT_PATH},
         )
 
@@ -166,7 +143,7 @@ def finetune_model(session):
         session.run_always(
             "python",
             "export_qat.py",
-            "vgg16_ckpts/ckpt_epoch26.pth",
+            "vgg16_ckpts/ckpt_epoch" + str(EPOCHS+1) + ".pth",
             env={"PYTHONPATH": PYT_PATH},
         )
     else:
@@ -182,13 +159,13 @@ def finetune_model(session):
             "--ckpt-dir",
             "vgg16_ckpts",
             "--start-from",
-            "25",
+            str(EPOCHS),
             "--epochs",
-            "26",
+            str(EPOCHS+1),
         )
 
         # Export model
-        session.run_always("python", "export_qat.py", "vgg16_ckpts/ckpt_epoch26.pth")
+        session.run_always("python", "export_qat.py", "vgg16_ckpts/ckpt_epoch" + str(EPOCHS+1) + ".pth")
 
 
 def cleanup(session):
@@ -209,7 +186,7 @@ def run_base_tests(session):
     print("Running basic tests")
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
     tests = [
-        "api",
+        "api/test_e2e_behavior.py",
         "integrations/test_to_backend_api.py",
     ]
     for test in tests:
@@ -218,6 +195,18 @@ def run_base_tests(session):
         else:
             session.run_always("pytest", test)
 
+def run_model_tests(session):
+    print("Running model tests")
+    session.chdir(os.path.join(TOP_DIR, "tests/py"))
+    tests = [
+        "models",
+    ]
+    for test in tests:
+        if USE_HOST_DEPS:
+            session.run_always("pytest", test, env={"PYTHONPATH": PYT_PATH})
+        else:
+            session.run_always("pytest", test)
+
 
 def run_accuracy_tests(session):
     print("Running accuracy tests")
@@ -268,8 +257,8 @@ def run_trt_compatibility_tests(session):
     copy_model(session)
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
     tests = [
-        "test_trt_intercompatibility.py",
-        "test_ptq_trt_calibrator.py",
+        "integrations/test_trt_intercompatibility.py",
+        #"ptq/test_ptq_trt_calibrator.py",
     ]
     for test in tests:
         if USE_HOST_DEPS:
@@ -282,7 +271,7 @@ def run_dla_tests(session):
     print("Running DLA tests")
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
     tests = [
-        "test_api_dla.py",
+        "hw/test_api_dla.py",
     ]
     for test in tests:
         if USE_HOST_DEPS:
@@ -295,7 +284,7 @@ def run_multi_gpu_tests(session):
     print("Running multi GPU tests")
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
     tests = [
-        "test_multi_gpu.py",
+        "hw/test_multi_gpu.py",
     ]
     for test in tests:
         if USE_HOST_DEPS:
@@ -321,22 +310,18 @@ def run_l0_dla_tests(session):
     run_base_tests(session)
     cleanup(session)
 
-
-def run_l1_accuracy_tests(session):
+def run_l1_model_tests(session):
     if not USE_HOST_DEPS:
         install_deps(session)
         install_torch_trt(session)
-    download_datasets(session)
-    train_model(session)
-    run_accuracy_tests(session)
+    download_models(session)
+    run_model_tests(session)
     cleanup(session)
 
-
 def run_l1_int8_accuracy_tests(session):
     if not USE_HOST_DEPS:
         install_deps(session)
         install_torch_trt(session)
-    download_datasets(session)
     train_model(session)
     finetune_model(session)
     run_int8_accuracy_tests(session)
@@ -348,7 +333,6 @@ def run_l2_trt_compatibility_tests(session):
         install_deps(session)
         install_torch_trt(session)
     download_models(session)
-    download_datasets(session)
     train_model(session)
     run_trt_compatibility_tests(session)
     cleanup(session)
@@ -368,18 +352,15 @@ def l0_api_tests(session):
     """When a developer needs to check correctness for a PR or something"""
     run_l0_api_tests(session)
 
-
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
 def l0_dla_tests(session):
     """When a developer needs to check basic api functionality using host dependencies"""
     run_l0_dla_tests(session)
 
-
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
-def l1_accuracy_tests(session):
-    """Checking accuracy performance on various usecases"""
-    run_l1_accuracy_tests(session)
-
+def l1_model_tests(session):
+    """When a developer needs to check correctness for a PR or something"""
+    run_l1_model_tests(session)
 
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
 def l1_int8_accuracy_tests(session):
@@ -397,13 +378,3 @@ def l2_trt_compatibility_tests(session):
 def l2_multi_gpu_tests(session):
     """Makes sure that Torch-TensorRT can operate on multi-gpu systems"""
     run_l2_multi_gpu_tests(session)
-
-
-@nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
-def download_test_models(session):
-    """Grab all the models needed for testing"""
-    try:
-        import torch
-    except ModuleNotFoundError:
-        install_deps(session)
-    download_models(session)
diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py
index 154b29dd7b..8f24bc76ad 100644
--- a/py/torch_tensorrt/ts/_compile_spec.py
+++ b/py/torch_tensorrt/ts/_compile_spec.py
@@ -225,8 +225,8 @@ def _parse_input_signature(input_signature: Any):
 
 
 def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec:
-    # TODO: Remove deep copy once collections does not need partial compilation
-    compile_spec = deepcopy(compile_spec_)
+    # TODO: Use deepcopy to support partial compilation of collections
+    compile_spec = compile_spec_
     info = _ts_C.CompileSpec()
 
     if len(compile_spec["inputs"]) > 0:
@@ -301,7 +301,7 @@ def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec:
             compile_spec["enabled_precisions"]
         )
 
-    if "calibrator" in compile_spec:
+    if "calibrator" in compile_spec and compile_spec["calibrator"]:
         info.ptq_calibrator = compile_spec["calibrator"]
 
     if "sparse_weights" in compile_spec:
diff --git a/tests/py/api/test_embed_engines.py b/tests/py/api/test_embed_engines.py
index 133c4c6a50..15bbffa62b 100644
--- a/tests/py/api/test_embed_engines.py
+++ b/tests/py/api/test_embed_engines.py
@@ -4,7 +4,6 @@
 import torchvision.models as models
 import copy
 import timm
-import custom_models as cm
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
diff --git a/tests/py/hw/test_api_dla.py b/tests/py/hw/test_api_dla.py
index 57b149faa7..ae6005bb1b 100644
--- a/tests/py/hw/test_api_dla.py
+++ b/tests/py/hw/test_api_dla.py
@@ -2,6 +2,7 @@
 import torch_tensorrt as torchtrt
 import torch
 import torchvision.models as models
+from utils import cosine_similarity, COSINE_THRESHOLD
 
 
 class ModelTestCaseOnDLA(unittest.TestCase):
@@ -39,8 +40,8 @@ def test_compile_traced(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"ModelTestCaseOnDLA traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_compile_script(self):
         compile_spec = {
@@ -55,8 +56,8 @@ def test_compile_script(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-2)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"ModelTestCaseOnDLA scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 def test_suite():
diff --git a/tests/py/hw/test_multi_gpu.py b/tests/py/hw/test_multi_gpu.py
index c068cc71b0..033404c927 100644
--- a/tests/py/hw/test_multi_gpu.py
+++ b/tests/py/hw/test_multi_gpu.py
@@ -35,9 +35,9 @@ def test_compile_traced(self):
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         torchtrt.set_device(self.target_gpu)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
         torchtrt.set_device(0)
-        self.assertTrue(same < 2e-3)
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSwitching traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_compile_script(self):
         torchtrt.set_device(0)
@@ -54,9 +54,10 @@ def test_compile_script(self):
 
         trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
         torchtrt.set_device(self.target_gpu)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
         torchtrt.set_device(0)
-        self.assertTrue(same < 2e-3)
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSwitching scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
 
 
 class TestMultiGpuSerializeDeserializeSwitching(ModelTestCase):
@@ -89,8 +90,8 @@ def test_compile_traced(self):
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         # Changing the device ID deliberately. It should still run on correct device ID by context switching
         torchtrt.set_device(1)
-        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSerializeDeserializeSwitching traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
     def test_compile_script(self):
         torchtrt.set_device(0)
@@ -108,8 +109,8 @@ def test_compile_script(self):
         trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
         # Changing the device ID deliberately. It should still run on correct device ID by context switching
         torchtrt.set_device(1)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
-        self.assertTrue(same < 2e-3)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSerializeDeserializeSwitching scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 def test_suite():
diff --git a/tests/py/integrations/test_to_backend_api.py b/tests/py/integrations/test_to_backend_api.py
index 16d839b1b0..b860d0333c 100644
--- a/tests/py/integrations/test_to_backend_api.py
+++ b/tests/py/integrations/test_to_backend_api.py
@@ -2,7 +2,7 @@
 import torch_tensorrt as torchtrt
 import torch
 import torchvision.models as models
-
+from utils import cosine_similarity, COSINE_THRESHOLD
 
 class TestToBackendLowering(unittest.TestCase):
     def setUp(self):
@@ -31,10 +31,9 @@ def setUp(self):
 
     def test_to_backend_lowering(self):
         trt_mod = torch._C._jit_to_backend("tensorrt", self.scripted_model, self.spec)
-        same = (
-            (trt_mod.forward(self.input) - self.scripted_model(self.input)).abs().max()
-        )
-        self.assertTrue(same < 2e-3)
+        cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
+        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestToBackendLowering TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+
 
 
 if __name__ == "__main__":
diff --git a/tests/py/integrations/test_trt_intercompatibility.py b/tests/py/integrations/test_trt_intercompatibility.py
index 96b47b7ccc..e82f1e54ca 100644
--- a/tests/py/integrations/test_trt_intercompatibility.py
+++ b/tests/py/integrations/test_trt_intercompatibility.py
@@ -3,7 +3,7 @@
 import torch
 import torchvision.models as models
 import tensorrt as trt
-
+from utils import cosine_similarity, COSINE_THRESHOLD
 
 class TestPyTorchToTRTEngine(unittest.TestCase):
     def test_pt_to_trt(self):
@@ -42,8 +42,8 @@ def test_pt_to_trt(self):
                         device="cuda:0"
                     ).cuda_stream,
                 )
-                same = (out - self.ts_model(self.input)).abs().max()
-                self.assertTrue(same < 2e-3)
+                cos_sim = cosine_similarity(self.model(self.input), out)
+                self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestPyTorchToTRTEngine TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 
 if __name__ == "__main__":
diff --git a/tests/py/api/custom_models.py b/tests/py/models/custom_models.py
similarity index 100%
rename from tests/py/api/custom_models.py
rename to tests/py/models/custom_models.py
diff --git a/tests/py/api/test_models.py b/tests/py/models/test_models.py
similarity index 100%
rename from tests/py/api/test_models.py
rename to tests/py/models/test_models.py
diff --git a/tests/py/api/test_multiple_registered_engines.py b/tests/py/models/test_multiple_registered_engines.py
similarity index 100%
rename from tests/py/api/test_multiple_registered_engines.py
rename to tests/py/models/test_multiple_registered_engines.py
diff --git a/tests/py/models/utils.py b/tests/py/models/utils.py
new file mode 100644
index 0000000000..a43b54a4a7
--- /dev/null
+++ b/tests/py/models/utils.py
@@ -0,0 +1,14 @@
+import torch
+
+COSINE_THRESHOLD=0.99
+
+def cosine_similarity(gt_tensor, pred_tensor):
+    gt_tensor = gt_tensor.flatten().to(torch.float32)
+    pred_tensor = pred_tensor.flatten().to(torch.float32)
+    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
+        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
+            return 1.0
+    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
+    res = res.cpu().detach().item()
+
+    return res

From 7e6b36cf0e87922232a86751deba379bd387d0c4 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Wed, 31 Aug 2022 17:22:21 -0700
Subject: [PATCH 04/12] chore: add missing scripts

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 tests/py/hw/utils.py           | 14 ++++++++++++++
 tests/py/integrations/utils.py | 14 ++++++++++++++
 tests/py/utils.py              | 14 ++++++++++++++
 3 files changed, 42 insertions(+)
 create mode 100644 tests/py/hw/utils.py
 create mode 100644 tests/py/integrations/utils.py
 create mode 100644 tests/py/utils.py

diff --git a/tests/py/hw/utils.py b/tests/py/hw/utils.py
new file mode 100644
index 0000000000..a43b54a4a7
--- /dev/null
+++ b/tests/py/hw/utils.py
@@ -0,0 +1,14 @@
+import torch
+
+COSINE_THRESHOLD=0.99
+
+def cosine_similarity(gt_tensor, pred_tensor):
+    gt_tensor = gt_tensor.flatten().to(torch.float32)
+    pred_tensor = pred_tensor.flatten().to(torch.float32)
+    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
+        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
+            return 1.0
+    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
+    res = res.cpu().detach().item()
+
+    return res
diff --git a/tests/py/integrations/utils.py b/tests/py/integrations/utils.py
new file mode 100644
index 0000000000..a43b54a4a7
--- /dev/null
+++ b/tests/py/integrations/utils.py
@@ -0,0 +1,14 @@
+import torch
+
+COSINE_THRESHOLD=0.99
+
+def cosine_similarity(gt_tensor, pred_tensor):
+    gt_tensor = gt_tensor.flatten().to(torch.float32)
+    pred_tensor = pred_tensor.flatten().to(torch.float32)
+    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
+        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
+            return 1.0
+    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
+    res = res.cpu().detach().item()
+
+    return res
diff --git a/tests/py/utils.py b/tests/py/utils.py
new file mode 100644
index 0000000000..a43b54a4a7
--- /dev/null
+++ b/tests/py/utils.py
@@ -0,0 +1,14 @@
+import torch
+
+COSINE_THRESHOLD=0.99
+
+def cosine_similarity(gt_tensor, pred_tensor):
+    gt_tensor = gt_tensor.flatten().to(torch.float32)
+    pred_tensor = pred_tensor.flatten().to(torch.float32)
+    if torch.sum(gt_tensor) == 0.0 or torch.sum(pred_tensor) == 0.0:
+        if torch.allclose(gt_tensor, pred_tensor, atol=1e-4, rtol=1e-4, equal_nan=True):
+            return 1.0
+    res = torch.nn.functional.cosine_similarity(gt_tensor, pred_tensor, dim=0, eps=1e-6)
+    res = res.cpu().detach().item()
+
+    return res

From ed75e9da13596e6bf11f5b1f6402392865f12159 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Wed, 31 Aug 2022 17:32:32 -0700
Subject: [PATCH 05/12] chore: Linter fixes

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 .github/workflows/docgen.yml                  |  2 +-
 .github/workflows/linter.yml                  |  2 +-
 noxfile.py                                    | 27 ++++++++---
 tests/cpp/test_collections.cpp                |  2 +-
 tests/cpp/test_modules_as_engines.cpp         |  4 +-
 tests/py/api/test_collections.py              | 42 +++++++++++++----
 tests/py/api/test_e2e_behavior.py             |  1 +
 tests/py/api/test_embed_engines.py            | 24 ++++++++--
 tests/py/api/test_module_fallback.py          | 15 ++++--
 tests/py/api/test_operator_fallback.py        | 11 ++++-
 tests/py/api/test_ts_backend.py               | 32 ++++++++++---
 tests/py/api/utils.py                         |  3 +-
 tests/py/hw/test_api_dla.py                   | 10 +++-
 tests/py/hw/test_multi_gpu.py                 | 21 +++++++--
 tests/py/hw/utils.py                          |  3 +-
 tests/py/integrations/test_to_backend_api.py  |  7 ++-
 .../test_trt_intercompatibility.py            |  6 ++-
 tests/py/integrations/utils.py                |  3 +-
 tests/py/models/custom_models.py              |  1 +
 tests/py/models/test_models.py                | 47 +++++++++++++++----
 .../test_multiple_registered_engines.py       | 20 ++++++--
 tests/py/models/utils.py                      |  3 +-
 tests/py/utils.py                             |  3 +-
 tests/util/util.cpp                           |  6 +--
 24 files changed, 227 insertions(+), 68 deletions(-)

diff --git a/.github/workflows/docgen.yml b/.github/workflows/docgen.yml
index 7b66b98be5..61af5bc5d9 100644
--- a/.github/workflows/docgen.yml
+++ b/.github/workflows/docgen.yml
@@ -31,7 +31,7 @@ jobs:
       - name: Set up Python 3.9.4
         uses: actions/setup-python@v2
         with:
-          python-version: 3.9.4 
+          python-version: 3.9.4
       - uses: actions/checkout@v2
         with:
           ref: ${{github.head_ref}}
diff --git a/.github/workflows/linter.yml b/.github/workflows/linter.yml
index 58c8440684..b56a233169 100644
--- a/.github/workflows/linter.yml
+++ b/.github/workflows/linter.yml
@@ -39,7 +39,7 @@ jobs:
           pip3 install -r $GITHUB_WORKSPACE/.github/scripts/requirements.txt
           pip3 install -r $GITHUB_WORKSPACE/requirements-dev.txt
       - name: Lint C++
-        run: | 
+        run: |
           cd $GITHUB_WORKSPACE
           python3 $GITHUB_WORKSPACE/.github/scripts/run_cpp_linter.py
         env:
diff --git a/noxfile.py b/noxfile.py
index 2b8e2da9b3..1f7c1433af 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -31,7 +31,7 @@
     print("Using dependencies from host python")
 
 # Set epochs to train VGG model for accuracy tests
-EPOCHS=25
+EPOCHS = 25
 
 SUPPORTED_PYTHON_VERSIONS = ["3.7", "3.8", "3.9", "3.10"]
 
@@ -39,6 +39,7 @@
     "l0_api_tests-" + "{}.{}".format(sys.version_info.major, sys.version_info.minor)
 ]
 
+
 def install_deps(session):
     print("Installing deps")
     session.install("-r", os.path.join(TOP_DIR, "py", "requirements.txt"))
@@ -110,7 +111,9 @@ def train_model(session):
             str(EPOCHS),
         )
 
-        session.run_always("python", "export_ckpt.py", "vgg16_ckpts/ckpt_epoch" + str(EPOCHS) + ".pth")
+        session.run_always(
+            "python", "export_ckpt.py", "vgg16_ckpts/ckpt_epoch" + str(EPOCHS) + ".pth"
+        )
 
 
 def finetune_model(session):
@@ -135,7 +138,7 @@ def finetune_model(session):
             "--start-from",
             str(EPOCHS),
             "--epochs",
-            str(EPOCHS+1),
+            str(EPOCHS + 1),
             env={"PYTHONPATH": PYT_PATH},
         )
 
@@ -143,7 +146,7 @@ def finetune_model(session):
         session.run_always(
             "python",
             "export_qat.py",
-            "vgg16_ckpts/ckpt_epoch" + str(EPOCHS+1) + ".pth",
+            "vgg16_ckpts/ckpt_epoch" + str(EPOCHS + 1) + ".pth",
             env={"PYTHONPATH": PYT_PATH},
         )
     else:
@@ -161,11 +164,15 @@ def finetune_model(session):
             "--start-from",
             str(EPOCHS),
             "--epochs",
-            str(EPOCHS+1),
+            str(EPOCHS + 1),
         )
 
         # Export model
-        session.run_always("python", "export_qat.py", "vgg16_ckpts/ckpt_epoch" + str(EPOCHS+1) + ".pth")
+        session.run_always(
+            "python",
+            "export_qat.py",
+            "vgg16_ckpts/ckpt_epoch" + str(EPOCHS + 1) + ".pth",
+        )
 
 
 def cleanup(session):
@@ -195,6 +202,7 @@ def run_base_tests(session):
         else:
             session.run_always("pytest", test)
 
+
 def run_model_tests(session):
     print("Running model tests")
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
@@ -258,7 +266,7 @@ def run_trt_compatibility_tests(session):
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
     tests = [
         "integrations/test_trt_intercompatibility.py",
-        #"ptq/test_ptq_trt_calibrator.py",
+        # "ptq/test_ptq_trt_calibrator.py",
     ]
     for test in tests:
         if USE_HOST_DEPS:
@@ -310,6 +318,7 @@ def run_l0_dla_tests(session):
     run_base_tests(session)
     cleanup(session)
 
+
 def run_l1_model_tests(session):
     if not USE_HOST_DEPS:
         install_deps(session)
@@ -318,6 +327,7 @@ def run_l1_model_tests(session):
     run_model_tests(session)
     cleanup(session)
 
+
 def run_l1_int8_accuracy_tests(session):
     if not USE_HOST_DEPS:
         install_deps(session)
@@ -352,16 +362,19 @@ def l0_api_tests(session):
     """When a developer needs to check correctness for a PR or something"""
     run_l0_api_tests(session)
 
+
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
 def l0_dla_tests(session):
     """When a developer needs to check basic api functionality using host dependencies"""
     run_l0_dla_tests(session)
 
+
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
 def l1_model_tests(session):
     """When a developer needs to check correctness for a PR or something"""
     run_l1_model_tests(session)
 
+
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
 def l1_int8_accuracy_tests(session):
     """Checking accuracy performance on various usecases"""
diff --git a/tests/cpp/test_collections.cpp b/tests/cpp/test_collections.cpp
index 3318aec99d..e3f0d91dfe 100644
--- a/tests/cpp/test_collections.cpp
+++ b/tests/cpp/test_collections.cpp
@@ -192,7 +192,7 @@ TEST(CppAPITests, TestCollectionTupleInputOutput) {
   auto trt_out = trt_mod.forward(complex_inputs);
 
   ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
-        out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5));
+      out.toTuple()->elements()[0].toTensor(), trt_out.toTuple()->elements()[0].toTensor(), 1e-5));
   ASSERT_TRUE(torch_tensorrt::tests::util::almostEqual(
       out.toTuple()->elements()[1].toTensor(), trt_out.toTuple()->elements()[1].toTensor(), 1e-5));
 }
diff --git a/tests/cpp/test_modules_as_engines.cpp b/tests/cpp/test_modules_as_engines.cpp
index 21670acdaf..430ce8201e 100644
--- a/tests/cpp/test_modules_as_engines.cpp
+++ b/tests/cpp/test_modules_as_engines.cpp
@@ -14,8 +14,8 @@ TEST_P(CppAPITests, ModuleAsEngineIsClose) {
   jit_results.push_back(jit_results_ivalues.toTensor());
   auto trt_results = torch_tensorrt::tests::util::RunModuleForwardAsEngine(mod, inputs);
 
-  ASSERT_TRUE(
-      torch_tensorrt::tests::util::cosineSimEqual(jit_results[0], trt_results[0].reshape_as(jit_results[0]), threshold));
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(
+      jit_results[0], trt_results[0].reshape_as(jit_results[0]), threshold));
 }
 
 #ifndef DISABLE_TEST_IN_CI
diff --git a/tests/py/api/test_collections.py b/tests/py/api/test_collections.py
index 88147e005e..936a4d5c73 100644
--- a/tests/py/api/test_collections.py
+++ b/tests/py/api/test_collections.py
@@ -41,8 +41,13 @@ def test_compile(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        cos_sim = cosine_similarity(self.model(self.input, self.input), trt_mod(self.input, self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"standard_tensor_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        cos_sim = cosine_similarity(
+            self.model(self.input, self.input), trt_mod(self.input, self.input)
+        )
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"standard_tensor_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 class TestTupleInput(unittest.TestCase):
@@ -65,8 +70,13 @@ def test_compile(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        cos_sim = cosine_similarity(self.model((self.input, self.input)), trt_mod((self.input, self.input)))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"tuple_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        cos_sim = cosine_similarity(
+            self.model((self.input, self.input)), trt_mod((self.input, self.input))
+        )
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"tuple_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 class TestListInput(unittest.TestCase):
@@ -87,8 +97,13 @@ def test_compile(self):
         }
 
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
-        cos_sim = cosine_similarity(self.model([self.input, self.input]), trt_mod([self.input, self.input]))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"list_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        cos_sim = cosine_similarity(
+            self.model([self.input, self.input]), trt_mod([self.input, self.input])
+        )
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"list_input_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 class TestTupleInputOutput(unittest.TestCase):
@@ -115,7 +130,10 @@ def test_compile(self):
         pyt_out = self.model((self.input, self.input))
         for (t, p) in zip(trt_out, pyt_out):
             cos_sim = cosine_similarity(t, p)
-            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"tuple_input_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+            self.assertTrue(
+                cos_sim > COSINE_THRESHOLD,
+                msg=f"tuple_input_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+            )
 
 
 class TestListInputOutput(unittest.TestCase):
@@ -143,7 +161,10 @@ def test_compile(self):
 
         for (t, p) in zip(trt_out, pyt_out):
             cos_sim = cosine_similarity(t, p)
-            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"list_input_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+            self.assertTrue(
+                cos_sim > COSINE_THRESHOLD,
+                msg=f"list_input_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+            )
 
 
 class TestListInputTupleOutput(unittest.TestCase):
@@ -170,7 +191,10 @@ def test_compile(self):
         pyt_out = self.model((self.input, self.input))
         for (t, p) in zip(trt_out, pyt_out):
             cos_sim = cosine_similarity(t, p)
-            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"list_input_tuple_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+            self.assertTrue(
+                cos_sim > COSINE_THRESHOLD,
+                msg=f"list_input_tuple_output_scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+            )
 
 
 if __name__ == "__main__":
diff --git a/tests/py/api/test_e2e_behavior.py b/tests/py/api/test_e2e_behavior.py
index 35cd3509dc..385fe916f4 100644
--- a/tests/py/api/test_e2e_behavior.py
+++ b/tests/py/api/test_e2e_behavior.py
@@ -5,6 +5,7 @@
 import copy
 from typing import Dict
 
+
 class TestInputTypeDefaultsFP32Model(unittest.TestCase):
     def test_input_use_default_fp32(self):
         self.model = models.resnet18(pretrained=True).eval().to("cuda")
diff --git a/tests/py/api/test_embed_engines.py b/tests/py/api/test_embed_engines.py
index 15bbffa62b..d21e139eca 100644
--- a/tests/py/api/test_embed_engines.py
+++ b/tests/py/api/test_embed_engines.py
@@ -7,6 +7,7 @@
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestModelToEngineToModel(unittest.TestCase):
     def test_resnet50(self):
         self.model = models.resnet50(pretrained=True).eval().to("cuda")
@@ -26,13 +27,20 @@ def test_resnet50(self):
         }
 
         self.scripted_model = torch.jit.script(self.model)
-        trt_engine = torchtrt.ts.convert_method_to_trt_engine(self.scripted_model, "forward", **compile_spec)
+        trt_engine = torchtrt.ts.convert_method_to_trt_engine(
+            self.scripted_model, "forward", **compile_spec
+        )
         trt_mod = torchtrt.ts.embed_engine_in_new_module(trt_engine)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_efficientnet_b0(self):
-        self.model = timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda")
+        self.model = (
+            timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda")
+        )
         self.input = torch.randn((1, 3, 224, 224)).to("cuda")
 
         compile_spec = {
@@ -49,11 +57,17 @@ def test_efficientnet_b0(self):
         }
 
         self.scripted_model = torch.jit.script(self.model)
-        trt_engine = torchtrt.ts.convert_method_to_trt_engine(self.scripted_model, "forward", **compile_spec)
+        trt_engine = torchtrt.ts.convert_method_to_trt_engine(
+            self.scripted_model, "forward", **compile_spec
+        )
         trt_mod = torchtrt.ts.embed_engine_in_new_module(trt_engine)
 
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/py/api/test_module_fallback.py b/tests/py/api/test_module_fallback.py
index 5d5fc425c2..5eda2cdbfc 100644
--- a/tests/py/api/test_module_fallback.py
+++ b/tests/py/api/test_module_fallback.py
@@ -6,6 +6,7 @@
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestModuleFallback(unittest.TestCase):
     def test_fallback_resnet18(self):
         self.model = models.resnet18(pretrained=True).eval().to("cuda")
@@ -25,7 +26,10 @@ def test_fallback_resnet18(self):
         }
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_fallback_mobilenet_v2(self):
         self.model = models.mobilenet_v2(pretrained=True).eval().to("cuda")
@@ -41,12 +45,17 @@ def test_fallback_mobilenet_v2(self):
                 "gpu_id": 0,
             },
             "enabled_precisions": {torch.float},
-            "torch_executed_modules": ["torchvision.models.mobilenetv2.ConvBNActivation"],
+            "torch_executed_modules": [
+                "torchvision.models.mobilenetv2.ConvBNActivation"
+            ],
             "min_block_size": 5,
         }
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Mobilenet V2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Mobilenet V2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/py/api/test_operator_fallback.py b/tests/py/api/test_operator_fallback.py
index 25d1b7cd92..302a663e24 100644
--- a/tests/py/api/test_operator_fallback.py
+++ b/tests/py/api/test_operator_fallback.py
@@ -6,6 +6,7 @@
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestFallbackModels(unittest.TestCase):
     def test_fallback_resnet18(self):
         self.model = models.resnet18(pretrained=True).eval().to("cuda")
@@ -25,7 +26,10 @@ def test_fallback_resnet18(self):
         }
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_fallback_mobilenet_v2(self):
         self.model = models.mobilenet_v2(pretrained=True).eval().to("cuda")
@@ -45,7 +49,10 @@ def test_fallback_mobilenet_v2(self):
         }
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Mobilenet V2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Mobilenet V2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/py/api/test_ts_backend.py b/tests/py/api/test_ts_backend.py
index 891f4ba178..e56ab4f902 100644
--- a/tests/py/api/test_ts_backend.py
+++ b/tests/py/api/test_ts_backend.py
@@ -6,6 +6,7 @@
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestCompile(unittest.TestCase):
     def test_compile_traced(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -27,7 +28,10 @@ def test_compile_traced(self):
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_compile_script(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -41,7 +45,10 @@ def test_compile_script(self):
                 enabled_precisions={torch.float},
             )
             cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+            self.assertTrue(
+                cos_sim > COSINE_THRESHOLD,
+                msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+            )
 
     def test_compile_global(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -54,7 +61,10 @@ def test_compile_global(self):
             enabled_precisions={torch.float},
         )
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_from_torch_tensor(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -71,7 +81,10 @@ def test_from_torch_tensor(self):
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_device(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -85,7 +98,10 @@ def test_device(self):
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_default_device(self):
         self.model = models.vgg16(pretrained=True).eval().to("cuda")
@@ -95,7 +111,11 @@ def test_default_device(self):
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"VGG16 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
 
 class TestCheckMethodOpSupport(unittest.TestCase):
     def test_check_support(self):
diff --git a/tests/py/api/utils.py b/tests/py/api/utils.py
index a43b54a4a7..b1e6632ec3 100644
--- a/tests/py/api/utils.py
+++ b/tests/py/api/utils.py
@@ -1,6 +1,7 @@
 import torch
 
-COSINE_THRESHOLD=0.99
+COSINE_THRESHOLD = 0.99
+
 
 def cosine_similarity(gt_tensor, pred_tensor):
     gt_tensor = gt_tensor.flatten().to(torch.float32)
diff --git a/tests/py/hw/test_api_dla.py b/tests/py/hw/test_api_dla.py
index ae6005bb1b..5328b92233 100644
--- a/tests/py/hw/test_api_dla.py
+++ b/tests/py/hw/test_api_dla.py
@@ -41,7 +41,10 @@ def test_compile_traced(self):
 
         trt_mod = torchtrt.ts.compile(self.traced_model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"ModelTestCaseOnDLA traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"ModelTestCaseOnDLA traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_compile_script(self):
         compile_spec = {
@@ -57,7 +60,10 @@ def test_compile_script(self):
 
         trt_mod = torchtrt.ts.compile(self.scripted_model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"ModelTestCaseOnDLA scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"ModelTestCaseOnDLA scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 def test_suite():
diff --git a/tests/py/hw/test_multi_gpu.py b/tests/py/hw/test_multi_gpu.py
index 033404c927..b6fa3f220b 100644
--- a/tests/py/hw/test_multi_gpu.py
+++ b/tests/py/hw/test_multi_gpu.py
@@ -37,7 +37,10 @@ def test_compile_traced(self):
         torchtrt.set_device(self.target_gpu)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
         torchtrt.set_device(0)
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSwitching traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"TestMultiGpuSwitching traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_compile_script(self):
         torchtrt.set_device(0)
@@ -56,8 +59,10 @@ def test_compile_script(self):
         torchtrt.set_device(self.target_gpu)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
         torchtrt.set_device(0)
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSwitching scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
-
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"TestMultiGpuSwitching scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 class TestMultiGpuSerializeDeserializeSwitching(ModelTestCase):
@@ -91,7 +96,10 @@ def test_compile_traced(self):
         # Changing the device ID deliberately. It should still run on correct device ID by context switching
         torchtrt.set_device(1)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSerializeDeserializeSwitching traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"TestMultiGpuSerializeDeserializeSwitching traced TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_compile_script(self):
         torchtrt.set_device(0)
@@ -110,7 +118,10 @@ def test_compile_script(self):
         # Changing the device ID deliberately. It should still run on correct device ID by context switching
         torchtrt.set_device(1)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestMultiGpuSerializeDeserializeSwitching scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"TestMultiGpuSerializeDeserializeSwitching scripted TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 def test_suite():
diff --git a/tests/py/hw/utils.py b/tests/py/hw/utils.py
index a43b54a4a7..b1e6632ec3 100644
--- a/tests/py/hw/utils.py
+++ b/tests/py/hw/utils.py
@@ -1,6 +1,7 @@
 import torch
 
-COSINE_THRESHOLD=0.99
+COSINE_THRESHOLD = 0.99
+
 
 def cosine_similarity(gt_tensor, pred_tensor):
     gt_tensor = gt_tensor.flatten().to(torch.float32)
diff --git a/tests/py/integrations/test_to_backend_api.py b/tests/py/integrations/test_to_backend_api.py
index b860d0333c..0f74a3af15 100644
--- a/tests/py/integrations/test_to_backend_api.py
+++ b/tests/py/integrations/test_to_backend_api.py
@@ -4,6 +4,7 @@
 import torchvision.models as models
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestToBackendLowering(unittest.TestCase):
     def setUp(self):
         self.input = torch.randn((1, 3, 300, 300)).to("cuda")
@@ -32,8 +33,10 @@ def setUp(self):
     def test_to_backend_lowering(self):
         trt_mod = torch._C._jit_to_backend("tensorrt", self.scripted_model, self.spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestToBackendLowering TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
-
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"TestToBackendLowering TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/py/integrations/test_trt_intercompatibility.py b/tests/py/integrations/test_trt_intercompatibility.py
index e82f1e54ca..b938e4a1ac 100644
--- a/tests/py/integrations/test_trt_intercompatibility.py
+++ b/tests/py/integrations/test_trt_intercompatibility.py
@@ -5,6 +5,7 @@
 import tensorrt as trt
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestPyTorchToTRTEngine(unittest.TestCase):
     def test_pt_to_trt(self):
         self.model = models.resnet18(pretrained=True).eval().to("cuda:0")
@@ -43,7 +44,10 @@ def test_pt_to_trt(self):
                     ).cuda_stream,
                 )
                 cos_sim = cosine_similarity(self.model(self.input), out)
-                self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"TestPyTorchToTRTEngine TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+                self.assertTrue(
+                    cos_sim > COSINE_THRESHOLD,
+                    msg=f"TestPyTorchToTRTEngine TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+                )
 
 
 if __name__ == "__main__":
diff --git a/tests/py/integrations/utils.py b/tests/py/integrations/utils.py
index a43b54a4a7..b1e6632ec3 100644
--- a/tests/py/integrations/utils.py
+++ b/tests/py/integrations/utils.py
@@ -1,6 +1,7 @@
 import torch
 
-COSINE_THRESHOLD=0.99
+COSINE_THRESHOLD = 0.99
+
 
 def cosine_similarity(gt_tensor, pred_tensor):
     gt_tensor = gt_tensor.flatten().to(torch.float32)
diff --git a/tests/py/models/custom_models.py b/tests/py/models/custom_models.py
index c6c0bb4c68..a19b9ca81c 100644
--- a/tests/py/models/custom_models.py
+++ b/tests/py/models/custom_models.py
@@ -1,6 +1,7 @@
 import torch
 from transformers import BertModel, BertTokenizer, BertConfig
 
+
 def BertModule():
     model_name = "bert-base-uncased"
     enc = BertTokenizer.from_pretrained(model_name)
diff --git a/tests/py/models/test_models.py b/tests/py/models/test_models.py
index 84860b9305..97a454c610 100644
--- a/tests/py/models/test_models.py
+++ b/tests/py/models/test_models.py
@@ -8,6 +8,7 @@
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestModels(unittest.TestCase):
     def test_resnet50(self):
         self.model = models.resnet50(pretrained=True).eval().to("cuda")
@@ -28,7 +29,10 @@ def test_resnet50(self):
 
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_mobilenet_v2(self):
         self.model = models.mobilenet_v2(pretrained=True).eval().to("cuda")
@@ -49,10 +53,15 @@ def test_mobilenet_v2(self):
 
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Mobilenet v2 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_efficientnet_b0(self):
-        self.model = timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda")
+        self.model = (
+            timm.create_model("efficientnet_b0", pretrained=True).eval().to("cuda")
+        )
         self.input = torch.randn((1, 3, 224, 224)).to("cuda")
 
         compile_spec = {
@@ -70,7 +79,10 @@ def test_efficientnet_b0(self):
 
         trt_mod = torchtrt.compile(self.model, **compile_spec)
         cos_sim = cosine_similarity(self.model(self.input), trt_mod(self.input))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"EfficientNet-B0 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
     def test_bert_base_uncased(self):
         self.model = cm.BertModule().cuda()
@@ -78,8 +90,16 @@ def test_bert_base_uncased(self):
 
         compile_spec = {
             "inputs": [
-                torchtrt.Input(self.input.shape, dtype=self.input.dtype, format=torch.contiguous_format),
-                torchtrt.Input(self.input.shape, dtype=self.input.dtype, format=torch.contiguous_format)
+                torchtrt.Input(
+                    self.input.shape,
+                    dtype=self.input.dtype,
+                    format=torch.contiguous_format,
+                ),
+                torchtrt.Input(
+                    self.input.shape,
+                    dtype=self.input.dtype,
+                    format=torch.contiguous_format,
+                ),
             ],
             "device": {
                 "device_type": torchtrt.DeviceType.GPU,
@@ -95,7 +115,10 @@ def test_bert_base_uncased(self):
         trt_model_outputs = trt_mod(self.input, self.input)
         for out, trt_out in zip(model_outputs, trt_model_outputs):
             cos_sim = cosine_similarity(out, trt_out)
-            self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+            self.assertTrue(
+                cos_sim > COSINE_THRESHOLD,
+                msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+            )
 
     def test_resnet50_half(self):
         self.model = models.resnet50(pretrained=True).eval().to("cuda")
@@ -117,8 +140,14 @@ def test_resnet50_half(self):
         }
 
         trt_mod = torchtrt.compile(self.scripted_model, **compile_spec)
-        cos_sim = cosine_similarity(self.model.half()(self.input.half()), trt_mod(self.input.half()))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        cos_sim = cosine_similarity(
+            self.model.half()(self.input.half()), trt_mod(self.input.half())
+        )
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet50 Half TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/py/models/test_multiple_registered_engines.py b/tests/py/models/test_multiple_registered_engines.py
index fb201f9d8f..98f012597b 100644
--- a/tests/py/models/test_multiple_registered_engines.py
+++ b/tests/py/models/test_multiple_registered_engines.py
@@ -8,6 +8,7 @@
 from typing import Dict
 from utils import cosine_similarity, COSINE_THRESHOLD
 
+
 class TestModelToEngineToModel(unittest.TestCase):
     def test_multiple_engines(self):
         self.resnet18 = models.resnet18(pretrained=True).eval().to("cuda")
@@ -30,11 +31,22 @@ def test_multiple_engines(self):
         rn18_trt_mod = torchtrt.compile(self.resnet18, **compile_spec)
         rn50_trt_mod = torchtrt.compile(self.resnet50, **compile_spec)
 
-        cos_sim = cosine_similarity(self.resnet18(self.input1), rn18_trt_mod(self.input1))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
+        cos_sim = cosine_similarity(
+            self.resnet18(self.input1), rn18_trt_mod(self.input1)
+        )
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet18 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
+
+        cos_sim = cosine_similarity(
+            self.resnet50(self.input1), rn50_trt_mod(self.input1)
+        )
+        self.assertTrue(
+            cos_sim > COSINE_THRESHOLD,
+            msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
+        )
 
-        cos_sim = cosine_similarity(self.resnet50(self.input1), rn50_trt_mod(self.input1))
-        self.assertTrue(cos_sim > COSINE_THRESHOLD, msg=f"Resnet50 TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}")
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/py/models/utils.py b/tests/py/models/utils.py
index a43b54a4a7..b1e6632ec3 100644
--- a/tests/py/models/utils.py
+++ b/tests/py/models/utils.py
@@ -1,6 +1,7 @@
 import torch
 
-COSINE_THRESHOLD=0.99
+COSINE_THRESHOLD = 0.99
+
 
 def cosine_similarity(gt_tensor, pred_tensor):
     gt_tensor = gt_tensor.flatten().to(torch.float32)
diff --git a/tests/py/utils.py b/tests/py/utils.py
index a43b54a4a7..b1e6632ec3 100644
--- a/tests/py/utils.py
+++ b/tests/py/utils.py
@@ -1,6 +1,7 @@
 import torch
 
-COSINE_THRESHOLD=0.99
+COSINE_THRESHOLD = 0.99
+
 
 def cosine_similarity(gt_tensor, pred_tensor):
     gt_tensor = gt_tensor.flatten().to(torch.float32)
diff --git a/tests/util/util.cpp b/tests/util/util.cpp
index 91004c06ff..8359d31576 100644
--- a/tests/util/util.cpp
+++ b/tests/util/util.cpp
@@ -6,9 +6,9 @@ namespace torch_tensorrt {
 namespace tests {
 namespace util {
 
-bool cosineSimEqual(const at::Tensor& computed_tensor, const at::Tensor& gt_tensor, float threshold = 0.99f){
-
-  torch::Tensor cosine_sim = torch::nn::functional::cosine_similarity(computed_tensor.flatten(), gt_tensor.flatten(), torch::nn::functional::CosineSimilarityFuncOptions().dim(0));
+bool cosineSimEqual(const at::Tensor& computed_tensor, const at::Tensor& gt_tensor, float threshold = 0.99f) {
+  torch::Tensor cosine_sim = torch::nn::functional::cosine_similarity(
+      computed_tensor.flatten(), gt_tensor.flatten(), torch::nn::functional::CosineSimilarityFuncOptions().dim(0));
   std::ostringstream ss;
   ss << computed_tensor << std::endl << gt_tensor << std::endl;
   LOG_GRAPH(ss.str());

From 3da78e984f0a13e9fea4068c2e60067c4374930d Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Thu, 1 Sep 2022 09:14:10 -0700
Subject: [PATCH 06/12] chore: Run all api tests in base tests; reword l1 docstring

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 noxfile.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 1f7c1433af..0f6235f612 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -193,7 +193,7 @@ def run_base_tests(session):
     print("Running basic tests")
     session.chdir(os.path.join(TOP_DIR, "tests/py"))
     tests = [
-        "api/test_e2e_behavior.py",
+        "api",
         "integrations/test_to_backend_api.py",
     ]
     for test in tests:
@@ -371,7 +371,7 @@ def l0_dla_tests(session):
 
 @nox.session(python=SUPPORTED_PYTHON_VERSIONS, reuse_venv=True)
 def l1_model_tests(session):
-    """When a developer needs to check correctness for a PR or something"""
+    """When a user needs to test the functionality of standard models compilation and results"""
     run_l1_model_tests(session)
 
 

From 0ca049f672a16f20f47f9c334f070b3b275b6bef Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Thu, 1 Sep 2022 09:21:35 -0700
Subject: [PATCH 07/12] chore: use rn18 instead of rn50

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 tests/cpp/test_modules_as_engines.cpp | 2 +-
 tests/py/models/test_models.py        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/cpp/test_modules_as_engines.cpp b/tests/cpp/test_modules_as_engines.cpp
index 430ce8201e..11b7a54fb0 100644
--- a/tests/cpp/test_modules_as_engines.cpp
+++ b/tests/cpp/test_modules_as_engines.cpp
@@ -24,7 +24,7 @@ INSTANTIATE_TEST_SUITE_P(
     ModuleAsEngineForwardIsCloseSuite,
     CppAPITests,
     testing::Values(
-        PathAndInput({"tests/modules/resnet50_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
+        PathAndInput({"tests/modules/resnet18_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
         PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
         PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99}),
         PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 0.99})));
diff --git a/tests/py/models/test_models.py b/tests/py/models/test_models.py
index 97a454c610..6cc9759626 100644
--- a/tests/py/models/test_models.py
+++ b/tests/py/models/test_models.py
@@ -10,8 +10,8 @@
 
 
 class TestModels(unittest.TestCase):
-    def test_resnet50(self):
-        self.model = models.resnet50(pretrained=True).eval().to("cuda")
+    def test_resnet18(self):
+        self.model = models.resnet18(pretrained=True).eval().to("cuda")
         self.input = torch.randn((1, 3, 224, 224)).to("cuda")
 
         compile_spec = {
@@ -120,8 +120,8 @@ def test_bert_base_uncased(self):
                 msg=f"HF BERT base-uncased TRT outputs don't match with the original model. Cosine sim score: {cos_sim} Threshold: {COSINE_THRESHOLD}",
             )
 
-    def test_resnet50_half(self):
-        self.model = models.resnet50(pretrained=True).eval().to("cuda")
+    def test_resnet18_half(self):
+        self.model = models.resnet18(pretrained=True).eval().to("cuda")
         self.input = torch.randn((1, 3, 224, 224)).to("cuda")
         self.scripted_model = torch.jit.script(self.model)
         self.scripted_model.half()

From c8640963e8e614bd30dd42a43c86091e0c31ed89 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Thu, 1 Sep 2022 13:28:02 -0700
Subject: [PATCH 08/12] chore: Add cpp tests with cosine sim

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 tests/core/partitioning/BUILD                 | 16 ++++
 .../test_fallback_graph_output.cpp            | 69 +++++++++++++++++
 tests/cpp/BUILD                               | 47 ++++++++++++
 tests/cpp/test_compiled_modules.cpp           | 60 +++++++++++++++
 tests/cpp/test_module_fallback.cpp            | 74 +++++++++++++++++++
 .../cpp/test_multiple_registered_engines.cpp  | 66 +++++++++++++++++
 6 files changed, 332 insertions(+)
 create mode 100644 tests/core/partitioning/test_fallback_graph_output.cpp
 create mode 100644 tests/cpp/test_compiled_modules.cpp
 create mode 100644 tests/cpp/test_module_fallback.cpp
 create mode 100644 tests/cpp/test_multiple_registered_engines.cpp

diff --git a/tests/core/partitioning/BUILD b/tests/core/partitioning/BUILD
index 5f90be2972..83722b4271 100644
--- a/tests/core/partitioning/BUILD
+++ b/tests/core/partitioning/BUILD
@@ -55,6 +55,21 @@ cc_test(
     }),
 )
 
+cc_test(
+    name = "test_fallback_graph_output",  # gtest target for the forced-operator fallback tests in the srcs file
+    srcs = ["test_fallback_graph_output.cpp"],
+    data = [
+        ":jit_models",  # presumably the serialized TorchScript models the test loads at runtime - confirm
+    ],
+    deps = [
+        "//tests/util",
+        "@googletest//:gtest_main",
+    ] + select({
+        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],  # libtorch label chosen by the ABI config setting
+        "//conditions:default": ["@libtorch//:libtorch"],
+    }),
+)
+
 cc_test(
     name = "test_loop_fallback",
     srcs = ["test_loop_fallback.cpp"],
@@ -89,6 +104,7 @@ test_suite(
     name = "partitioning_tests",
     tests = [
         ":test_conditionals",
+        ":test_fallback_graph_output",
         ":test_loading_model",
         ":test_loop_fallback",
         ":test_resolve_nontensor_inputs",
diff --git a/tests/core/partitioning/test_fallback_graph_output.cpp b/tests/core/partitioning/test_fallback_graph_output.cpp
new file mode 100644
index 0000000000..3da717074a
--- /dev/null
+++ b/tests/core/partitioning/test_fallback_graph_output.cpp
@@ -0,0 +1,69 @@
+#include <string>
+#include <unordered_set>
+#include "core/compiler.h"
+#include "gtest/gtest.h"
+#include "tests/util/util.h"
+#include "torch/script.h"
+
+#ifndef DISABLE_TEST_IN_CI
+
+// Adds aten::add to forced_fallback_operators, compiles ResNet-50 and compares the result with the TorchScript output.
+TEST(Partitioning, ComputeResNet50FallbackGraphCorrectly) {
+  torch::jit::script::Module mod;
+  try {
+    mod = torch::jit::load("tests/modules/resnet50_traced.jit.pt");
+  } catch (const c10::Error& e) {
+    // A model that cannot be loaded must fail the test; a bare return would let it be reported as passed.
+    FAIL() << "error loading the model: " << e.what();
+  }
+
+  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
+  std::vector<torch::jit::IValue> jit_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt_inputs_ivalues;
+  for (const auto& in_shape : input_shapes) {
+    auto in = at::randint(5, in_shape, {at::kCUDA});
+    jit_inputs_ivalues.push_back(in.clone());
+    trt_inputs_ivalues.push_back(in.clone());
+  }
+
+  std::vector<torch_tensorrt::core::ir::Input> input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})};
+  torch_tensorrt::core::CompileSpec cfg(input_ranges);
+  cfg.partition_info.enabled = true;
+  cfg.partition_info.forced_fallback_operators.push_back("aten::add");
+
+  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
+  auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg);
+  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results, trt_results, 0.99));
+}
+
+// Adds aten::hardtanh to forced_fallback_operators, compiles MobileNetV2 and compares with the TorchScript output.
+TEST(Partitioning, ComputeMobileNetFallbackGraphCorrectly) {
+  torch::jit::script::Module mod;
+  try {
+    mod = torch::jit::load("tests/modules/mobilenet_v2_traced.jit.pt");
+  } catch (const c10::Error& e) {
+    // A model that cannot be loaded must fail the test; a bare return would let it be reported as passed.
+    FAIL() << "error loading the model: " << e.what();
+  }
+
+  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
+  std::vector<torch::jit::IValue> jit_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt_inputs_ivalues;
+  for (const auto& in_shape : input_shapes) {
+    auto in = at::randint(5, in_shape, {at::kCUDA});
+    jit_inputs_ivalues.push_back(in.clone());
+    trt_inputs_ivalues.push_back(in.clone());
+  }
+
+  std::vector<torch_tensorrt::core::ir::Input> input_ranges{torch_tensorrt::core::ir::Input({1, 3, 224, 224})};
+  torch_tensorrt::core::CompileSpec cfg(input_ranges);
+  cfg.partition_info.enabled = true;
+  cfg.partition_info.forced_fallback_operators.push_back("aten::hardtanh");
+
+  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
+  auto trt_mod = torch_tensorrt::core::CompileGraph(mod, cfg);
+  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results, trt_results, 0.99));
+}
+#endif
diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD
index ea2c6ae752..3d56682189 100644
--- a/tests/cpp/BUILD
+++ b/tests/cpp/BUILD
@@ -13,9 +13,12 @@ test_suite(
     name = "api_tests",
     tests = [
         ":test_collections",
+        ":test_compiled_modules",
         ":test_default_input_types",
         ":test_example_tensors",
+        ":test_module_fallback",
         ":test_modules_as_engines",
+        ":test_multiple_registered_engines",
         ":test_runtime_thread_safety",
         ":test_serialization",
     ],
@@ -25,9 +28,12 @@ test_suite(
     name = "aarch64_api_tests",
     tests = [
         ":test_collections",
+        ":test_compiled_modules",
         ":test_default_input_types",
         ":test_example_tensors",
+        ":test_module_fallback",
         ":test_modules_as_engines",
+        ":test_multiple_registered_engines",
         ":test_runtime_thread_safety",
         ":test_serialization",
     ],
@@ -66,6 +72,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "test_multiple_registered_engines",  # gtest target built from the single srcs file below
+    srcs = ["test_multiple_registered_engines.cpp"],
+    data = [
+        "//tests/modules:jit_models",  # presumably the .jit.pt files the test loads from tests/modules - confirm
+    ],
+    deps = [
+        "//tests/util",
+        "@googletest//:gtest_main",
+    ] + select({
+        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],  # libtorch label chosen by the ABI config setting
+        "//conditions:default": ["@libtorch//:libtorch"],
+    }),
+)
+
 cc_test(
     name = "test_modules_as_engines",
     timeout = "long",
@@ -89,6 +110,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "test_module_fallback",  # gtest target built from the single srcs file below
+    srcs = ["test_module_fallback.cpp"],
+    data = [
+        "//tests/modules:jit_models",  # presumably the .jit.pt files the test loads from tests/modules - confirm
+    ],
+    deps = [
+        "//tests/util",
+        "@googletest//:gtest_main",
+    ] + select({
+        ":use_pre_cxx11_abi": ["@libtorch_pre_cxx11_abi//:libtorch"],  # libtorch label chosen by the ABI config setting
+        "//conditions:default": ["@libtorch//:libtorch"],
+    }),
+)
+
 cc_test(
     name = "test_collections",
     srcs = ["test_collections.cpp"],
@@ -104,6 +140,17 @@ cc_test(
     }),
 )
 
+cc_test(
+    name = "test_compiled_modules",  # parameterized compile-and-compare test; the source includes cpp_api_test.h
+    srcs = ["test_compiled_modules.cpp"],
+    data = [
+        "//tests/modules:jit_models",  # presumably the .jit.pt files named in the test's parameter list - confirm
+    ],
+    deps = [
+        ":cpp_api_test",  # the only direct dependency; no gtest or libtorch label is listed here
+    ],
+)
+
 cc_test(
     name = "test_multi_gpu_serde",
     srcs = ["test_multi_gpu_serde.cpp"],
diff --git a/tests/cpp/test_compiled_modules.cpp b/tests/cpp/test_compiled_modules.cpp
new file mode 100644
index 0000000000..e1e923b47a
--- /dev/null
+++ b/tests/cpp/test_compiled_modules.cpp
@@ -0,0 +1,60 @@
+#include "cpp_api_test.h"
+
+TEST_P(CppAPITests, CompiledModuleIsClose) {  // compile the fixture module via the TS API, compare with TorchScript
+  std::vector<torch::jit::IValue> jit_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt_inputs_ivalues;
+  std::vector<torch_tensorrt::Input> shapes;
+  for (uint64_t i = 0; i < input_shapes.size(); i++) {
+    auto in = at::randint(5, input_shapes[i], {at::kCUDA}).to(input_types[i]);  // one random tensor per declared input
+    jit_inputs_ivalues.push_back(in.clone());  // each run receives its own clone of the same data
+    trt_inputs_ivalues.push_back(in.clone());
+    auto in_spec = torch_tensorrt::Input(input_shapes[i]);
+    in_spec.dtype = input_types[i];
+    shapes.push_back(in_spec);
+    std::cout << in_spec << std::endl;  // prints the input spec to stdout on every iteration
+  }
+
+  torch::jit::IValue jit_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod, jit_inputs_ivalues);
+  std::vector<at::Tensor> jit_results;
+  if (jit_results_ivalues.isTuple()) {  // flatten a tuple result into a list of tensors
+    auto tuple = jit_results_ivalues.toTuple();
+    for (auto t : tuple->elements()) {
+      jit_results.push_back(t.toTensor());
+    }
+  } else {
+    jit_results.push_back(jit_results_ivalues.toTensor());
+  }
+
+  auto spec = torch_tensorrt::ts::CompileSpec(shapes);
+  spec.truncate_long_and_double = true;
+
+  auto trt_mod = torch_tensorrt::ts::compile(mod, spec);
+  torch::jit::IValue trt_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(trt_mod, trt_inputs_ivalues);
+  std::vector<at::Tensor> trt_results;  // NOTE(review): loop below indexes jit_results by trt_results.size(), unchecked
+  if (trt_results_ivalues.isTuple()) {  // same flattening as for the TorchScript result
+    auto tuple = trt_results_ivalues.toTuple();
+    for (auto t : tuple->elements()) {
+      trt_results.push_back(t.toTensor());
+    }
+  } else {
+    trt_results.push_back(trt_results_ivalues.toTensor());
+  }
+
+  for (size_t i = 0; i < trt_results.size(); i++) {
+    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i]), 0.99));
+  }
+}
+
+#ifndef DISABLE_TEST_IN_CI
+
+INSTANTIATE_TEST_SUITE_P(
+    CompiledModuleForwardIsCloseSuite,
+    CppAPITests,
+    testing::Values(  // NOTE(review): CompiledModuleIsClose passes a literal 0.99; confirm the 4th field is still read
+        PathAndInput({"tests/modules/resnet18_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
+        PathAndInput({"tests/modules/mobilenet_v2_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 2e-5}),
+        PathAndInput({"tests/modules/efficientnet_b0_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 8e-3}),
+        PathAndInput({"tests/modules/bert_base_uncased_traced.jit.pt", {{1, 14}, {1, 14}}, {at::kInt, at::kInt}, 8e-2}),
+        PathAndInput({"tests/modules/vit_scripted.jit.pt", {{1, 3, 224, 224}}, {at::kFloat}, 8e-2})));
+
+#endif
diff --git a/tests/cpp/test_module_fallback.cpp b/tests/cpp/test_module_fallback.cpp
new file mode 100644
index 0000000000..bfdfc46b04
--- /dev/null
+++ b/tests/cpp/test_module_fallback.cpp
@@ -0,0 +1,74 @@
+#include <string>
+#include "gtest/gtest.h"
+#include "tests/util/util.h"
+#include "torch/script.h"
+#include "torch_tensorrt/torch_tensorrt.h"
+
+#ifndef DISABLE_TEST_IN_CI
+
+// Lists BasicBlock in torch_executed_modules, compiles ResNet-18 and compares the result with the TorchScript output.
+TEST(CppAPITest, ResNetModuleFallbacksCorrectly) {
+  torch::jit::script::Module mod;
+  try {
+    mod = torch::jit::load("tests/modules/resnet18_scripted.jit.pt");
+  } catch (const c10::Error& e) {
+    FAIL() << "error loading the model: " << e.what();  // keep the load error in the gtest failure report
+  }
+
+  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
+  std::vector<torch::jit::IValue> jit_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt_inputs_ivalues;
+  for (const auto& in_shape : input_shapes) {
+    auto in = at::randint(5, in_shape, {at::kCUDA});
+    jit_inputs_ivalues.push_back(in.clone());
+    trt_inputs_ivalues.push_back(in.clone());
+  }
+
+  torch_tensorrt::ts::CompileSpec cfg(input_shapes);
+  cfg.torch_executed_modules.push_back("torchvision.models.resnet.BasicBlock");
+
+  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
+  auto trt_mod = torch_tensorrt::ts::compile(mod, cfg);
+  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results, trt_results, 0.99));
+}
+
+// Compiles MobileNetV2 with ConvBNActivation listed in torch_executed_modules and expects one TensorRT engine node.
+TEST(CppAPITest, MobileNetModuleFallbacksCorrectlyWithOneEngine) {
+  torch::jit::script::Module mod;
+  try {
+    mod = torch::jit::load("tests/modules/mobilenet_v2_scripted.jit.pt");
+  } catch (const c10::Error& e) {
+    FAIL() << "error loading the model: " << e.what();  // keep the load error in the gtest failure report
+  }
+
+  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
+  std::vector<torch::jit::IValue> jit_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt_inputs_ivalues;
+  for (const auto& in_shape : input_shapes) {
+    auto in = at::randint(5, in_shape, {at::kCUDA});
+    jit_inputs_ivalues.push_back(in.clone());
+    trt_inputs_ivalues.push_back(in.clone());
+  }
+
+  torch_tensorrt::ts::CompileSpec cfg(input_shapes);
+  cfg.min_block_size = 5;
+  cfg.torch_executed_modules.push_back("torchvision.models.mobilenetv2.ConvBNActivation");
+
+  auto jit_results = mod.forward(jit_inputs_ivalues).toTensor();
+  auto trt_mod = torch_tensorrt::ts::compile(mod, cfg);
+
+  // Count tensorrt::execute_engine nodes in the top-level block of the compiled forward graph.
+  auto g = trt_mod.get_method("forward").graph();
+  std::size_t trt_count = 0;
+  for (const auto n : g->block()->nodes()) {
+    if (n->kind().toQualString() == std::string("tensorrt::execute_engine")) {
+      trt_count++;
+    }
+  }
+  ASSERT_EQ(trt_count, 1U);  // ASSERT_EQ prints the actual count when the expectation fails
+
+  auto trt_results = trt_mod.forward(trt_inputs_ivalues).toTensor();
+  ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results, trt_results, 0.99));
+}
+#endif
diff --git a/tests/cpp/test_multiple_registered_engines.cpp b/tests/cpp/test_multiple_registered_engines.cpp
new file mode 100644
index 0000000000..16ae4c8a66
--- /dev/null
+++ b/tests/cpp/test_multiple_registered_engines.cpp
@@ -0,0 +1,66 @@
+#include <string>
+#include "gtest/gtest.h"
+#include "tests/util/util.h"
+#include "torch/script.h"
+#include "torch_tensorrt/torch_tensorrt.h"
+
+#ifndef DISABLE_TEST_IN_CI
+
+// Loads the ResNet-18 model twice, compiles both copies and checks each compiled result against its TorchScript output.
+TEST(CppAPITest, CanRunMultipleEngines) {
+  torch::jit::script::Module mod1;
+  torch::jit::script::Module mod2;
+  try {
+    mod1 = torch::jit::load("tests/modules/resnet18_traced.jit.pt");
+    mod2 = torch::jit::load("tests/modules/resnet18_traced.jit.pt");
+  } catch (const c10::Error& e) {
+    FAIL() << "error loading the model: " << e.what();  // a bare return here would report the test as passed
+  }
+
+  const std::vector<std::vector<int64_t>> input_shapes = {{1, 3, 224, 224}};
+
+  std::vector<torch::jit::IValue> jit1_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt1_inputs_ivalues;
+  for (auto in_shape : input_shapes) {
+    auto in = at::randint(5, in_shape, {at::kCUDA});
+    jit1_inputs_ivalues.push_back(in.clone());
+    trt1_inputs_ivalues.push_back(in.clone());
+  }
+
+  std::vector<torch::jit::IValue> jit2_inputs_ivalues;
+  std::vector<torch::jit::IValue> trt2_inputs_ivalues;
+  for (auto in_shape : input_shapes) {
+    auto in = at::randint(5, in_shape, {at::kCUDA});
+    jit2_inputs_ivalues.push_back(in.clone());
+    trt2_inputs_ivalues.push_back(in.clone());
+  }
+
+  torch::jit::IValue jit1_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod1, jit1_inputs_ivalues);
+  std::vector<at::Tensor> jit1_results;
+  jit1_results.push_back(jit1_results_ivalues.toTensor());
+
+  torch::jit::IValue jit2_results_ivalues = torch_tensorrt::tests::util::RunModuleForward(mod2, jit2_inputs_ivalues);
+  std::vector<at::Tensor> jit2_results;
+  jit2_results.push_back(jit2_results_ivalues.toTensor());
+
+  auto trt_mod1 = torch_tensorrt::ts::compile(mod1, input_shapes);
+  torch::jit::IValue trt1_results_ivalues =
+      torch_tensorrt::tests::util::RunModuleForward(trt_mod1, trt1_inputs_ivalues);
+  std::vector<at::Tensor> trt1_results;
+  trt1_results.push_back(trt1_results_ivalues.toTensor());
+
+  auto trt_mod2 = torch_tensorrt::ts::compile(mod2, input_shapes);
+  torch::jit::IValue trt2_results_ivalues =
+      torch_tensorrt::tests::util::RunModuleForward(trt_mod2, trt2_inputs_ivalues);
+  std::vector<at::Tensor> trt2_results;
+  trt2_results.push_back(trt2_results_ivalues.toTensor());
+
+  for (size_t i = 0; i < trt1_results.size(); i++) {
+    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit1_results[i], trt1_results[i].reshape_as(jit1_results[i]), 0.99));
+  }
+
+  for (size_t i = 0; i < trt2_results.size(); i++) {
+    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit2_results[i], trt2_results[i].reshape_as(jit2_results[i]), 0.99));
+  }
+}
+#endif

From 8d8cbfd747a129367155f0c0695279077c448a23 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Thu, 1 Sep 2022 13:32:11 -0700
Subject: [PATCH 09/12] chore: linter fixes

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 tests/cpp/test_compiled_modules.cpp            | 3 ++-
 tests/cpp/test_multiple_registered_engines.cpp | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/cpp/test_compiled_modules.cpp b/tests/cpp/test_compiled_modules.cpp
index e1e923b47a..3a81f0a531 100644
--- a/tests/cpp/test_compiled_modules.cpp
+++ b/tests/cpp/test_compiled_modules.cpp
@@ -41,7 +41,8 @@ TEST_P(CppAPITests, CompiledModuleIsClose) {
   }
 
   for (size_t i = 0; i < trt_results.size(); i++) {
-    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i]), 0.99));
+    ASSERT_TRUE(
+        torch_tensorrt::tests::util::cosineSimEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i]), 0.99));
   }
 }
 
diff --git a/tests/cpp/test_multiple_registered_engines.cpp b/tests/cpp/test_multiple_registered_engines.cpp
index 16ae4c8a66..658f59ca74 100644
--- a/tests/cpp/test_multiple_registered_engines.cpp
+++ b/tests/cpp/test_multiple_registered_engines.cpp
@@ -56,11 +56,13 @@ TEST(CppAPITest, CanRunMultipleEngines) {
   trt2_results.push_back(trt2_results_ivalues.toTensor());
 
   for (size_t i = 0; i < trt1_results.size(); i++) {
-    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit1_results[i], trt1_results[i].reshape_as(jit1_results[i]), 0.99));
+    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(
+        jit1_results[i], trt1_results[i].reshape_as(jit1_results[i]), 0.99));
   }
 
   for (size_t i = 0; i < trt2_results.size(); i++) {
-    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(jit2_results[i], trt2_results[i].reshape_as(jit2_results[i]), 0.99));
+    ASSERT_TRUE(torch_tensorrt::tests::util::cosineSimEqual(
+        jit2_results[i], trt2_results[i].reshape_as(jit2_results[i]), 0.99));
   }
 }
 #endif

From 13cc0248c321783aff3717c5c5179a6b0ee42913 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Tue, 6 Sep 2022 11:52:00 -0700
Subject: [PATCH 10/12] chore: Deepcopy other objects

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 py/torch_tensorrt/ts/_compile_spec.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py
index 8f24bc76ad..6f267978ec 100644
--- a/py/torch_tensorrt/ts/_compile_spec.py
+++ b/py/torch_tensorrt/ts/_compile_spec.py
@@ -226,7 +226,13 @@ def _parse_input_signature(input_signature: Any):
 
 def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec:
     # TODO: Use deepcopy to support partial compilation of collections
-    compile_spec = compile_spec_
+    compile_spec = {}
+    for k, v in compile_spec_.items():
+        if k != "calibrator":
+            compile_spec[k] = deepcopy(v)
+        else:
+            compile_spec[k] = v
+
     info = _ts_C.CompileSpec()
 
     if len(compile_spec["inputs"]) > 0:

From 749048cabe83725bd9a0208afb04b39556adbe4d Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Tue, 6 Sep 2022 18:07:40 -0700
Subject: [PATCH 11/12] fix: Fix deepcopy issues of PTQ calibrators

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 noxfile.py                                     |  2 --
 py/torch_tensorrt/ptq.py                       | 14 ++++++++++----
 py/torch_tensorrt/ts/_compile_spec.py          |  8 +-------
 tests/py/ptq/test_ptq_dataloader_calibrator.py |  8 ++++----
 4 files changed, 15 insertions(+), 17 deletions(-)

diff --git a/noxfile.py b/noxfile.py
index 0f6235f612..eff8136fbb 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -342,8 +342,6 @@ def run_l2_trt_compatibility_tests(session):
     if not USE_HOST_DEPS:
         install_deps(session)
         install_torch_trt(session)
-    download_models(session)
-    train_model(session)
     run_trt_compatibility_tests(session)
     cleanup(session)
 
diff --git a/py/torch_tensorrt/ptq.py b/py/torch_tensorrt/ptq.py
index 326f35f942..670690b433 100644
--- a/py/torch_tensorrt/ptq.py
+++ b/py/torch_tensorrt/ptq.py
@@ -55,6 +55,11 @@ def write_calibration_cache(self, cache):
     else:
         return b""
 
+# deepcopy (which involves pickling) is performed on the compile_spec internally during compilation.
+# We register this __reduce__ function for pickler to identity the calibrator object returned by DataLoaderCalibrator during deepcopy.
+# This should be the object's local name relative to the module https://docs.python.org/3/library/pickle.html#object.__reduce__
+def __reduce__(self):
+    return self.__class__.__name__
 
 class DataLoaderCalibrator(object):
     """
@@ -114,24 +119,25 @@ def __new__(cls, *args, **kwargs):
             "get_batch": get_cache_mode_batch if use_cache else get_batch,
             "read_calibration_cache": read_calibration_cache,
             "write_calibration_cache": write_calibration_cache,
+            "__reduce__": __reduce__ # used when you deepcopy the DataLoaderCalibrator object
         }
 
         # Using type metaclass to construct calibrator class based on algorithm type
         if algo_type == CalibrationAlgo.ENTROPY_CALIBRATION:
             return type(
-                "DataLoaderCalibrator", (_C.IInt8EntropyCalibrator,), attribute_mapping
+                "Int8EntropyCalibrator", (_C.IInt8EntropyCalibrator,), attribute_mapping
             )()
         elif algo_type == CalibrationAlgo.ENTROPY_CALIBRATION_2:
             return type(
-                "DataLoaderCalibrator", (_C.IInt8MinMaxCalibrator,), attribute_mapping
+                "Int8EntropyCalibrator2", (_C.IInt8EntropyCalibrator2,), attribute_mapping
             )()
         elif algo_type == CalibrationAlgo.LEGACY_CALIBRATION:
             return type(
-                "DataLoaderCalibrator", (_C.IInt8LegacyCalibrator,), attribute_mapping
+                "Int8LegacyCalibrator", (_C.IInt8LegacyCalibrator,), attribute_mapping
             )()
         elif algo_type == CalibrationAlgo.MINMAX_CALIBRATION:
             return type(
-                "DataLoaderCalibrator", (_C.IInt8MinMaxCalibrator,), attribute_mapping
+                "Int8MinMaxCalibrator", (_C.IInt8MinMaxCalibrator,), attribute_mapping
             )()
         else:
             log(
diff --git a/py/torch_tensorrt/ts/_compile_spec.py b/py/torch_tensorrt/ts/_compile_spec.py
index 6f267978ec..9616111caa 100644
--- a/py/torch_tensorrt/ts/_compile_spec.py
+++ b/py/torch_tensorrt/ts/_compile_spec.py
@@ -226,13 +226,7 @@ def _parse_input_signature(input_signature: Any):
 
 def _parse_compile_spec(compile_spec_: Dict[str, Any]) -> _ts_C.CompileSpec:
     # TODO: Use deepcopy to support partial compilation of collections
-    compile_spec = {}
-    for k, v in compile_spec_.items():
-        if k != "calibrator":
-            compile_spec[k] = deepcopy(v)
-        else:
-            compile_spec[k] = v
-
+    compile_spec = deepcopy(compile_spec_)
     info = _ts_C.CompileSpec()
 
     if len(compile_spec["inputs"]) > 0:
diff --git a/tests/py/ptq/test_ptq_dataloader_calibrator.py b/tests/py/ptq/test_ptq_dataloader_calibrator.py
index 2ee1fa5b08..79c19dadbf 100644
--- a/tests/py/ptq/test_ptq_dataloader_calibrator.py
+++ b/tests/py/ptq/test_ptq_dataloader_calibrator.py
@@ -81,9 +81,6 @@ def test_compile_script(self):
             device=torch.device("cuda:0"),
         )
 
-        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
-        log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
-
         compile_spec = {
             "inputs": [torchtrt.Input([1, 3, 32, 32])],
             "enabled_precisions": {torch.float, torch.int8},
@@ -96,8 +93,11 @@ def test_compile_script(self):
                 "allow_gpu_fallback": False,
             },
         }
-
         trt_mod = torchtrt.ts.compile(self.model, **compile_spec)
+
+        fp32_test_acc = compute_accuracy(self.testing_dataloader, self.model)
+        log(Level.Info, "[Pyt FP32] Test Acc: {:.2f}%".format(100 * fp32_test_acc))
+
         int8_test_acc = compute_accuracy(self.testing_dataloader, trt_mod)
         log(Level.Info, "[TRT INT8] Test Acc: {:.2f}%".format(100 * int8_test_acc))
         acc_diff = fp32_test_acc - int8_test_acc

From af2076110f6ae4448b56d0d7eb7884d0bc1aef81 Mon Sep 17 00:00:00 2001
From: Dheeraj Peri <peri.dheeraj@gmail.com>
Date: Tue, 6 Sep 2022 18:09:20 -0700
Subject: [PATCH 12/12] chore: linter fixes

Signed-off-by: Dheeraj Peri <peri.dheeraj@gmail.com>
---
 py/torch_tensorrt/ptq.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/py/torch_tensorrt/ptq.py b/py/torch_tensorrt/ptq.py
index 670690b433..e7f3411cd5 100644
--- a/py/torch_tensorrt/ptq.py
+++ b/py/torch_tensorrt/ptq.py
@@ -55,12 +55,14 @@ def write_calibration_cache(self, cache):
     else:
         return b""
 
+
 # deepcopy (which involves pickling) is performed on the compile_spec internally during compilation.
 # We register this __reduce__ function for pickler to identity the calibrator object returned by DataLoaderCalibrator during deepcopy.
 # This should be the object's local name relative to the module https://docs.python.org/3/library/pickle.html#object.__reduce__
 def __reduce__(self):
     return self.__class__.__name__
 
+
 class DataLoaderCalibrator(object):
     """
     Constructs a calibrator class in TensorRT and uses pytorch dataloader to load/preproces
@@ -119,7 +121,7 @@ def __new__(cls, *args, **kwargs):
             "get_batch": get_cache_mode_batch if use_cache else get_batch,
             "read_calibration_cache": read_calibration_cache,
             "write_calibration_cache": write_calibration_cache,
-            "__reduce__": __reduce__ # used when you deepcopy the DataLoaderCalibrator object
+            "__reduce__": __reduce__,  # used when you deepcopy the DataLoaderCalibrator object
         }
 
         # Using type metaclass to construct calibrator class based on algorithm type
@@ -129,7 +131,9 @@ def __new__(cls, *args, **kwargs):
             )()
         elif algo_type == CalibrationAlgo.ENTROPY_CALIBRATION_2:
             return type(
-                "Int8EntropyCalibrator2", (_C.IInt8EntropyCalibrator2,), attribute_mapping
+                "Int8EntropyCalibrator2",
+                (_C.IInt8EntropyCalibrator2,),
+                attribute_mapping,
             )()
         elif algo_type == CalibrationAlgo.LEGACY_CALIBRATION:
             return type(