
Commit c565348

r-barnes authored and facebook-github-bot committed
More types for fbgemm_gpu (#585)
Summary: Pull Request resolved: #585
Reviewed By: xush6528
Differential Revision: D27550481
fbshipit-source-id: 3eccfa359cd8f983b7c74a7c2f22c75024779f3b
1 parent 4c43051 commit c565348

3 files changed: +50 additions, -58 deletions

fbgemm_gpu/bench/split_table_batched_embeddings_benchmark.py

Lines changed: 12 additions & 14 deletions
@@ -20,6 +20,7 @@
     SparseType,
     SplitTableBatchedEmbeddingBagsCodegen,
 )
+from torch import Tensor

 logging.basicConfig(level=logging.DEBUG)

@@ -43,8 +44,8 @@ def get_device() -> torch.device:
 # Merged indices with shape (T, B, L) -> (flattened indices with shape
 # (T * B * L), offsets with shape (T * B + 1))
 def get_table_batched_offsets_from_dense(
-    merged_indices: torch.Tensor,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+    merged_indices: Tensor,
+) -> Tuple[Tensor, Tensor]:
     (T, B, L) = merged_indices.size()
     lengths = np.ones((T, B)) * L
     flat_lengths = lengths.flatten()
@@ -67,7 +68,7 @@ def generate_requests(
     alpha: float = 1.0,
     weights_precision: SparseType = SparseType.FP32,
     weighted: bool = False,
-) -> List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]]:
+) -> List[Tuple[Tensor, Tensor, Optional[Tensor]]]:
     if alpha <= 1.0:
         all_indices = torch.randint(
             low=0,
@@ -111,9 +112,8 @@ def generate_requests(


 def benchmark_requests(
-    requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
-    # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-    f: Callable,
+    requests: List[Tuple[Tensor, Tensor, Optional[Tensor]]],
+    func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor],
 ) -> float:
     if torch.cuda.is_available():
         torch.cuda.synchronize()
@@ -123,7 +123,7 @@ def benchmark_requests(
     else:
         start_time = time.time()
     for (indices, offsets, weights) in requests:
-        f(indices, offsets, weights)
+        func(indices, offsets, weights)
     if torch.cuda.is_available():
         end_event.record()
         torch.cuda.synchronize()
@@ -133,11 +133,9 @@ def benchmark_requests(


 def benchmark_pipelined_requests(
-    requests: List[Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]],
-    # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-    f: Callable,
-    # pyre-fixme[24]: Generic type `Callable` expects 2 type parameters.
-    g: Callable,
+    requests: List[Tuple[Tensor, Tensor, Optional[Tensor]]],
+    func1: Callable[[Tensor, Tensor, Optional[Tensor]], None],
+    func2: Callable[[Tensor, Tensor, Optional[Tensor]], None],
 ) -> Tuple[float, float]:
     torch.cuda.synchronize()
     start_events = [
@@ -152,10 +150,10 @@ def benchmark_pipelined_requests(
         requests, start_events, end_events
     ):
         start_event[0].record()
-        f(indices, offsets, indices_weights)
+        func1(indices, offsets, indices_weights)
         end_event[0].record()
         start_event[1].record()
-        g(indices, offsets, indices_weights)
+        func2(indices, offsets, indices_weights)
         end_event[1].record()
     torch.cuda.synchronize()
     return (
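Note: the bare `Callable` annotations above are replaced with fully parameterized signatures. As a rough, self-contained illustration of why that helps (this sketch is not part of the commit; `run_requests` and the dummy request data are made up for the example), a type checker can now verify the callback's arity and argument types at the call site:

from typing import Callable, List, Optional, Tuple

import torch
from torch import Tensor


def run_requests(
    requests: List[Tuple[Tensor, Tensor, Optional[Tensor]]],
    func: Callable[[Tensor, Tensor, Optional[Tensor]], Tensor],
) -> None:
    # Because func's parameter and return types are spelled out, passing a
    # callback with the wrong arity or argument types is a static error
    # instead of a failure at benchmark time.
    for (indices, offsets, weights) in requests:
        func(indices, offsets, weights)


requests = [(torch.arange(4), torch.tensor([0, 2, 4]), None)]
run_requests(requests, lambda indices, offsets, weights: indices.float().sum())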

fbgemm_gpu/fbgemm_gpu/split_table_batched_embeddings_ops.py

Lines changed: 18 additions & 24 deletions
@@ -27,6 +27,10 @@
 INT8_EMB_ROW_DIM_OFFSET = 8


+class DoesNotHavePrefix(Exception):
+    pass
+
+
 class EmbeddingLocation(enum.IntEnum):
     DEVICE = 0
     MANAGED = 1
@@ -420,10 +424,9 @@ def __init__( # noqa C901

         self.step = 0

-    # pyre-fixme[3]: Return type must be annotated.
-    def get_states(self, prefix: str):
+    def get_states(self, prefix: str) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
         if not hasattr(self, f"{prefix}_physical_placements"):
-            return None
+            raise DoesNotHavePrefix()
         dev_param = getattr(self, f"{prefix}_dev")
         host_param = getattr(self, f"{prefix}_host")
         uvm_param = getattr(self, f"{prefix}_uvm")
@@ -437,14 +440,13 @@ def get_states(self, prefix: str):
             torch.tensor(offsets, dtype=torch.int64),
         )

-    # pyre-fixme[24]: Generic type `list` expects 1 type parameter, use
-    # `typing.List` to avoid runtime subscripting errors.
-    def get_all_states(self) -> List:
+    def get_all_states(self) -> List[Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]]:
         all_states = []
         for prefix in ["weights", "momentum1", "momentum2"]:
-            states = self.get_states(prefix)
-            if states:
-                all_states.append(states)
+            try:
+                all_states.append(self.get_states(prefix))
+            except DoesNotHavePrefix:
+                pass
         return all_states

     def forward(
@@ -741,16 +743,11 @@ def split_optimizer_states(self) -> List[Tuple[torch.Tensor]]:
         """

         def get_optimizer_states(
-            # pyre-fixme[2]: Parameter must be annotated.
-            state_dev,
-            # pyre-fixme[2]: Parameter must be annotated.
-            state_host,
-            # pyre-fixme[2]: Parameter must be annotated.
-            state_uvm,
-            # pyre-fixme[2]: Parameter must be annotated.
-            state_offsets,
-            # pyre-fixme[2]: Parameter must be annotated.
-            state_placements,
+            state_dev: Tensor,
+            state_host: Tensor,
+            state_uvm: Tensor,
+            state_offsets: Tensor,
+            state_placements: Tensor,
             rowwise: bool,
         ) -> List[torch.Tensor]:
             splits = []
@@ -872,9 +869,7 @@ def flush(self) -> None:
             self.stochastic_rounding,
         )

-    # pyre-fixme[2]: Parameter must be annotated.
-    # pyre-fixme[2]: Parameter must be annotated.
-    def _apply_split(self, split, prefix, dtype: torch.dtype, enforce_hbm: bool = False) -> None:
+    def _apply_split(self, split: SplitState, prefix: str, dtype: torch.dtype, enforce_hbm: bool = False) -> None:
         setattr(self, f"{prefix}_physical_placements", split.placements)
         setattr(self, f"{prefix}_physical_offsets", split.offsets)

@@ -1184,8 +1179,7 @@ def __init__(
                 row for (row, _) in embedding_specs[:t]
             )

-        # pyre-fixme[4]: Attribute must be annotated.
-        self.weights_physical_offsets = weights_offsets
+        self.weights_physical_offsets: List[int] = weights_offsets
         weights_offsets = [weights_offsets[t] for t in feature_table_map]
         self.register_buffer(
             "weights_offsets",

fbgemm_gpu/test/split_table_batched_embeddings_test.py

Lines changed: 20 additions & 20 deletions
@@ -9,17 +9,33 @@

 import copy
 import unittest
-from typing import Any, Callable, List, Optional, Tuple
+from typing import Any, Callable, List, Optional, Tuple, TypeVar

 import fbgemm_gpu.split_table_batched_embeddings_ops as split_table_batched_embeddings_ops
 import hypothesis.strategies as st
 import numpy as np
 import torch
 from fbgemm_gpu.split_table_batched_embeddings_ops import OptimType, SparseType
+from torch import Tensor
 from hypothesis import HealthCheck, Verbosity, assume, given, settings


 MAX_EXAMPLES = 40
+Deviceable = TypeVar("Deviceable", torch.nn.EmbeddingBag, Tensor)
+
+
+def b_indices(
+    b: torch.nn.EmbeddingBag,
+    x: torch.Tensor,
+    per_sample_weights: Optional[torch.Tensor] = None,
+    use_cpu: bool = False
+) -> torch.Tensor:
+    (indices, offsets) = get_offsets_from_dense(x)
+    return b(
+        to_device(indices, use_cpu),
+        to_device(offsets, use_cpu),
+        per_sample_weights=per_sample_weights,
+    )


 def div_round_up(a: int, b: int) -> int:
@@ -35,8 +51,9 @@ def get_offsets_from_dense(indices: torch.Tensor) -> Tuple[torch.Tensor, torch.T
         ),
     )

-
-def to_device(t: torch.Tensor, use_cpu: bool) -> torch.Tensor:
+def to_device(t: Deviceable, use_cpu: bool) -> Deviceable:
+    # pyre-fixme[7]: Expected `Deviceable` but got `Union[Tensor,
+    #  torch.nn.EmbeddingBag]`.
     return t.cpu() if use_cpu else t.cuda()

@@ -239,11 +256,9 @@ def test_forward(
            xws = [xw.half() for xw in xws]

        fs = (
-            # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
            [b_indices(b, x, use_cpu=use_cpu) for (b, x) in zip(bs, xs)]
            if not weighted
            else [
-                # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
                b_indices(b, x, per_sample_weights=xw.view(-1), use_cpu=use_cpu)
                for (b, x, xw) in zip(bs, xs, xws)
            ]
@@ -270,7 +285,6 @@ def test_forward(
            cc = torch.jit.script(cc)

        for t in range(T):
-            # pyre-fixme[16]: `Tensor` has no attribute `weight`.
            cc.split_embedding_weights()[t].data.copy_(bs[t].weight)

        x = torch.cat([x.view(1, B, L) for x in xs], dim=0)
@@ -385,19 +399,16 @@ def test_backward_dense(
            xws = [xw.half() for xw in xws]

        fs = (
-            # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
            [b_indices(b, x, use_cpu=use_cpu) for (b, x) in zip(bs, xs)]
            if not weighted
            else [
-                # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
                b_indices(b, x, per_sample_weights=xw.view(-1), use_cpu=use_cpu)
                for (b, x, xw) in zip(bs, xs, xws)
            ]
        )
        gos = [torch.randn_like(f) for f in fs]
        [f.backward(go) for (f, go) in zip(fs, gos)]

-        # pyre-fixme[16]: `Tensor` has no attribute `weight`.
        grad_weights = torch.cat([b.weight.grad.view(-1) for b in bs])
        if weights_precision == SparseType.FP16 and not use_cpu:
            grad_weights = grad_weights.half()
@@ -570,7 +581,6 @@ def test_backward_sgd( # noqa C901
        feature_table_map = list(range(T))
        if exact:
            table_to_replicate = T // 2
-            # pyre-fixme[6]: Expected `HalfTensor` for 2nd param but got `Tensor`.
            bs.insert(table_to_replicate, bs[table_to_replicate])
            feature_table_map.insert(table_to_replicate, table_to_replicate)

@@ -598,11 +608,9 @@ def test_backward_sgd( # noqa C901
            xws = [xw.half() for xw in xws]

        fs = (
-            # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
            [b_indices(b, x, use_cpu=use_cpu) for (b, x) in zip(bs, xs)]
            if not weighted
            else [
-                # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
                b_indices(b, x, per_sample_weights=xw.view(-1), use_cpu=use_cpu)
                for (b, x, xw) in zip(bs, xs, xws)
            ]
@@ -613,7 +621,6 @@ def test_backward_sgd( # noqa C901
        lr = 0.05
        if exact:
            del bs[table_to_replicate]
-        # pyre-fixme[16]: `Tensor` has no attribute `weight`.
        new_weights = [(b.weight - b.weight.grad * lr) for b in bs]

        cc = split_table_batched_embeddings_ops.SplitTableBatchedEmbeddingBagsCodegen(
@@ -782,7 +789,6 @@ def test_backward_adagrad( # noqa C901
        if exact:
            # autograd with shared embedding only works for exact
            table_to_replicate = T // 2
-            # pyre-fixme[6]: Expected `HalfTensor` for 2nd param but got `Tensor`.
            bs.insert(table_to_replicate, bs[table_to_replicate])
            feature_table_map.insert(table_to_replicate, table_to_replicate)

@@ -805,11 +811,9 @@ def test_backward_adagrad( # noqa C901
            xws = [xw.half() for xw in xws]

        fs = (
-            # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
            [b_indices(b, x, use_cpu=use_cpu) for (b, x) in zip(bs, xs)]
            if not weighted
            else [
-                # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
                b_indices(b, x, per_sample_weights=xw.view(-1), use_cpu=use_cpu)
                for (b, x, xw) in zip(bs, xs, xws)
            ]
@@ -839,7 +843,6 @@ def test_backward_adagrad( # noqa C901
        if exact:
            del bs[table_to_replicate]
        for t in range(T):
-            # pyre-fixme[16]: `Tensor` has no attribute `weight`.
            cc.split_embedding_weights()[t].data.copy_(bs[t].weight)

        x = torch.cat([x.view(1, B, L) for x in xs], dim=0)
@@ -1162,11 +1165,9 @@ def test_backward_optimizers( # noqa C901
        xws_acc_type = copy.deepcopy(xws)

        fs = (
-            # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
            [b_indices(b, x, use_cpu=use_cpu) for (b, x) in zip(bs, xs)]
            if not weighted
            else [
-                # pyre-fixme[6]: Expected `(...) -> Any` for 1st param but got `Tensor`.
                b_indices(b, x, per_sample_weights=xw.view(-1), use_cpu=use_cpu)
                for (b, x, xw) in zip(bs, xs, xws)
            ]
@@ -1214,7 +1215,6 @@ def test_backward_optimizers( # noqa C901
        )

        for t in range(T):
-            # pyre-fixme[16]: `Tensor` has no attribute `weight`.
            cc.split_embedding_weights()[t].data.copy_(bs[t].weight)

        x = torch.cat([x.view(1, B, L) for x in xs], dim=0)
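Note: the test helper `to_device` is now generic over a value-constrained `TypeVar`, so it returns the same kind of object it receives (an `EmbeddingBag` stays an `EmbeddingBag`, a `Tensor` stays a `Tensor`), and downstream accesses such as `.weight` type-check without per-call fixmes. A small standalone sketch of that pattern (assumes only `torch`; the embedding sizes here are arbitrary, not taken from the tests):

from typing import TypeVar

import torch
from torch import Tensor

# Value-constrained TypeVar: the return type of to_device mirrors whichever
# of the two allowed argument types was passed in.
Deviceable = TypeVar("Deviceable", torch.nn.EmbeddingBag, Tensor)


def to_device(t: Deviceable, use_cpu: bool) -> Deviceable:
    # Both Tensor and nn.EmbeddingBag expose .cpu()/.cuda(); a checker may
    # still want a suppression here, as the commit does with pyre-fixme[7].
    return t.cpu() if use_cpu else t.cuda()


emb = to_device(torch.nn.EmbeddingBag(10, 4, mode="sum"), use_cpu=True)
print(emb.weight.shape)  # emb is known to be an EmbeddingBag, so .weight checks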
