Commit 40cb9fe

Sasha Fomina authored and facebook-github-bot committed
Moving all tensor allocation from cpu to meta device in SplitTableBatchedEmbeddingBagsCodegen
Summary: Used profiler logs sorted by `cpu_memory_usage` in `embedding_bag_wprofiler_gpu_test.py` to add a `device` kwarg to all tensor allocation sites in `split_table_batched_embeddings_ops.py`, so that they can be materialized on the meta device. Some `torch.tensor` calls were switched out for `torch.zeros` calls (where appropriate) to avoid temporary allocations in CPU memory. Some `torch.tensor` calls were kept, with a temporary CPU memory allocation but final materialization on the meta device.

Reviewed By: xush6528

Differential Revision: D29566376

fbshipit-source-id: c01575127cb2392f95ec1d3712ad43803a373db5
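For context on the pattern the diff applies: `torch.tensor(data)` always builds its data in CPU memory first, whereas factory functions like `torch.zeros` take a `device` kwarg and allocate directly on the target device, including `meta` (shape/dtype metadata only, no storage). A minimal sketch of the difference, standalone and not taken from the patch:

```python
import torch

# torch.tensor copies Python data into a CPU tensor before any device
# transfer, so it always costs a temporary host allocation.
host_first = torch.tensor([0], dtype=torch.int64)

# torch.zeros accepts device= and allocates there directly; with the
# meta device no storage is allocated at all, only shape and dtype.
direct = torch.zeros(1, dtype=torch.int64, device="meta")

print(host_first.device)  # cpu
print(direct.device)      # meta
```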
1 parent 57f3478 commit 40cb9fe

File tree

1 file changed (+13, -13 lines)


fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py

Lines changed: 13 additions & 13 deletions
@@ -196,7 +196,7 @@ def __init__( # noqa C901
         self.weights_precision = weights_precision
         self.record_cache_metrics = record_cache_metrics
         # NOTE: a placeholder to avoid multi-construction and make TorchScript work!
-        self.dummy_tensor: Tensor = torch.tensor(0)
+        self.dummy_tensor: Tensor = torch.zeros(0, device=device)

         self.embedding_specs = embedding_specs
         (rows, dims, locations, compute_devices) = zip(*embedding_specs)
@@ -373,30 +373,30 @@ def __init__( # noqa C901
                 prefix="momentum2",
                 dtype=torch.float32,
             )
-            self.register_buffer("iter", torch.tensor([0], dtype=torch.int64))
+            self.register_buffer("iter", torch.zeros(1, dtype=torch.int64, device=self.current_device))
         else:
             # NOTE: make TorchScript work!
             self.register_buffer(
-                "momentum2_dev", torch.tensor([0], dtype=torch.int64), persistent=False
+                "momentum2_dev", torch.zeros(1, dtype=torch.int64, device=self.current_device), persistent=False
             )
             self.register_buffer(
-                "momentum2_host", torch.tensor([0], dtype=torch.int64), persistent=False
+                "momentum2_host", torch.zeros(1, dtype=torch.int64, device=self.current_device), persistent=False
             )
             self.register_buffer(
-                "momentum2_uvm", torch.tensor([0], dtype=torch.int64), persistent=False
+                "momentum2_uvm", torch.zeros(1, dtype=torch.int64, device=self.current_device), persistent=False
             )
             self.register_buffer(
                 "momentum2_placements",
-                torch.tensor([0], dtype=torch.int64),
+                torch.zeros(1, dtype=torch.int64, device=self.current_device),
                 persistent=False,
             )
             self.register_buffer(
                 "momentum2_offsets",
-                torch.tensor([0], dtype=torch.int64),
+                torch.zeros(1, dtype=torch.int64, device=self.current_device),
                 persistent=False,
             )
             self.register_buffer(
-                "iter", torch.tensor([0], dtype=torch.int64), persistent=False
+                "iter", torch.zeros(1, dtype=torch.int64, device=self.current_device), persistent=False
             )

         cache_state = construct_cache_state(embedding_specs, self.feature_table_map)
@@ -983,27 +983,27 @@ def _apply_cache_state(
         # NOTE: make TorchScript work!
         self.register_buffer(
             "cache_hash_size_cumsum",
-            torch.tensor([0], dtype=torch.int64),
+            torch.zeros(1, dtype=torch.int64, device=self.current_device),
             persistent=False,
         )
         self.register_buffer(
             "total_cache_hash_size",
-            torch.tensor([0], dtype=torch.int64),
+            torch.zeros(1, dtype=torch.int64, device=self.current_device),
             persistent=False,
         )
         self.register_buffer(
             "cache_index_table_map",
-            torch.tensor([0], dtype=torch.int64),
+            torch.zeros(1, dtype=torch.int64, device=self.current_device),
             persistent=False,
         )
         self.register_buffer(
             "lxu_cache_state",
-            torch.tensor([0], dtype=torch.int64),
+            torch.zeros(1, dtype=torch.int64, device=self.current_device),
             persistent=False,
         )
         self.register_buffer(
             "lxu_state",
-            torch.tensor([0], dtype=torch.int64),
+            torch.zeros(1, dtype=torch.int64, device=self.current_device),
             persistent=False,
         )
         self.register_buffer(
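For reference, the placeholder-buffer pattern the diff applies throughout, shown on a toy module (the `Placeholder` class here is illustrative, not from `split_table_batched_embeddings_ops.py`; only the buffer name and allocation call mirror the patch):

```python
import torch

class Placeholder(torch.nn.Module):
    def __init__(self, device: torch.device) -> None:
        super().__init__()
        self.current_device = device
        # Non-persistent placeholder registered directly on the target
        # device; torch.zeros never stages the data in CPU memory.
        self.register_buffer(
            "lxu_state",
            torch.zeros(1, dtype=torch.int64, device=self.current_device),
            persistent=False,
        )

m = Placeholder(torch.device("meta"))
print(m.lxu_state.device)  # meta
```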
