torchao/prototype/moe_training: 2 files changed, +16 -2 lines.

First changed file:

@@ -40,7 +40,7 @@ def _scaled_grouped_mm(
         offs (int32 torch.Tensor): The offsets to use to mark the starting index of each group along dim0 of the A tensor.
         out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
     """
-    logger.info("Using scaled_grouped_mm")
+    # logger.info("Using scaled_grouped_mm")
     return _Float8GroupedMM.apply(
         A,
         B_t,
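The removed `logger.info` fired on every call into `_scaled_grouped_mm`, i.e. on every forward pass. If the message is still wanted, one alternative (not part of this diff; the flag and helper names below are hypothetical) is to emit it only once per process:

```python
import logging

logger = logging.getLogger(__name__)

# Hypothetical module-level flag; not part of the PR.
_logged_scaled_grouped_mm_use = False


def _log_scaled_grouped_mm_once() -> None:
    """Log the 'Using scaled_grouped_mm' message only the first time the op is hit."""
    global _logged_scaled_grouped_mm_use
    if not _logged_scaled_grouped_mm_use:
        logger.info("Using scaled_grouped_mm")
        _logged_scaled_grouped_mm_use = True
```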
Second changed file:

@@ -47,7 +47,6 @@ def __new__(
         cls,
         tensor: torch.Tensor,
     ):
-        # logger.info(f"ScaledGroupedMMTensor __new__: tensor.dtype={tensor.dtype}, dtype: {dtype}, shape: {tensor.shape}")
         return torch.Tensor._make_wrapper_subclass(
             cls,
             tensor.size(),
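For context, `torch.Tensor._make_wrapper_subclass` (the private hook called above) builds a tensor subclass that reports the inner tensor's metadata while keeping the real data in a wrapped attribute. A minimal, self-contained sketch of that pattern, independent of `ScaledGroupedMMTensor` (the class name below is made up for illustration):

```python
import torch
from torch.utils._pytree import tree_map_only


class WrapperTensorSketch(torch.Tensor):
    """Toy wrapper subclass: stores the real data in `_data` and unwraps on dispatch."""

    @staticmethod
    def __new__(cls, tensor: torch.Tensor):
        # The wrapper mirrors the inner tensor's metadata but holds no storage itself.
        return torch.Tensor._make_wrapper_subclass(
            cls,
            tensor.size(),
            strides=tensor.stride(),
            dtype=tensor.dtype,
            device=tensor.device,
            requires_grad=tensor.requires_grad,
        )

    def __init__(self, tensor: torch.Tensor):
        self._data = tensor

    def __repr__(self):
        return f"WrapperTensorSketch({self._data!r})"

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        # Unwrap any WrapperTensorSketch arguments and run the op on plain tensors.
        def unwrap(t):
            return t._data

        args = tree_map_only(WrapperTensorSketch, unwrap, args)
        kwargs = tree_map_only(WrapperTensorSketch, unwrap, kwargs or {})
        return func(*args, **kwargs)


x = WrapperTensorSketch(torch.randn(4, 4))
y = torch.mm(x, torch.randn(4, 4))  # routes through __torch_dispatch__
```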
@@ -155,9 +154,24 @@ def fsdp_post_all_gather(
     ):
         (data,) = all_gather_outputs

+        # For training step 1+, out=unsharded param, so we need to copy data to `out`
+        # if `self._data` and `out` do not share the same storage.
+        # Otherwise, if they do share the same storage, we can just return directly.
         if out is not None:
+            assert isinstance(out, ScaledGroupedMMTensor), f"{type(out)}"
+            if data.dtype == param_dtype:
+                assert (
+                    data.untyped_storage().data_ptr()
+                    == out._data.untyped_storage().data_ptr()
+                )
+            else:
+                assert out._data.dtype == param_dtype, (
+                    f"{out._data.dtype} {param_dtype}"
+                )
+                out._data.copy_(data)
             return

+        # For training step 0, out=None, so we need to return a new ScaledGroupedMMTensor.
         output = ScaledGroupedMMTensor(data)
         inner_tensors = (data,)
         return output, inner_tensors
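The new branch relies on FSDP2 handing back an `out` parameter whose storage may or may not already alias the all-gather output buffer. A minimal sketch of just that storage check on plain tensors (the helper name is made up; this is not the FSDP code path itself):

```python
import torch


def copy_if_needed(out: torch.Tensor, data: torch.Tensor) -> None:
    """Copy `data` into `out` only when the two do not already share storage."""
    if data.untyped_storage().data_ptr() == out.untyped_storage().data_ptr():
        # Same underlying buffer: `out` already sees the all-gathered values.
        return
    # Different buffers: materialize the values in `out` (copy_ also casts dtypes).
    out.copy_(data)


base = torch.randn(4, 4)
alias = base.view(4, 4)       # shares storage with `base`
copy_if_needed(alias, base)   # no-op

dst = torch.empty(4, 4, dtype=torch.bfloat16)
copy_if_needed(dst, base)     # copies and casts into bf16
```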