     run_tests,
 )

-from torchao.float8.config import e4m3_dtype
 from torchao.quantization import (
     FbgemmConfig,
     quantize_,
@@ -29,15 +28,10 @@
 @unittest.skipIf(not is_sm_at_least_90(), "Need sm90+")
 class TestFbgemmFp8Tensor(TestCase):
     def setUp(self):
+        self.e4m3_dtype = torch.float8_e4m3fn
         self.config = FbgemmConfig(
-            input_dtype=e4m3_dtype,
-            weight_dtype=e4m3_dtype,
+            input_dtype=self.e4m3_dtype,
+            weight_dtype=self.e4m3_dtype,
             output_dtype=torch.bfloat16,
         )
-        self.bmm_config = FbgemmConfig(
-            input_dtype=e4m3_dtype,
-            weight_dtype=e4m3_dtype,
-            output_dtype=torch.bfloat16,
-            transpose_input=True,
-        )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []

     def test_linear(self):
@@ -128,7 +122,9 @@ def forward(self, x):
         weight = torch.randn(10, 128, 256, dtype=dtype, device=device)
         m = M(weight).eval()
         original = m(input)
-        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        # we need to transpose the weight first for bmm
+        m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
+        quantize_(m, self.config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 20)

@@ -146,6 +142,54 @@ def test_to_device(self):
             quantize_(linear, self.config)
             linear.to(device)

+    def test_cat(self):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype)
+        # weight: (256, 128)
+        linear2 = torch.nn.Linear(128, 256, dtype=dtype)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        dummy1 = torch.nn.Linear(128, 512, bias=False, dtype=dtype, device=device)
+
+        dummy1.weight = torch.nn.Parameter(cat_weight1)
+        quantize_(dummy1, self.config)
+
+        quantize_(linear1, self.config)
+        quantize_(linear2, self.config)
+
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertEqual(cat_qweight1.shape, (512, 128))
+        self.assertEqual(dummy1.weight.float8_data, cat_qweight1.float8_data)
+        self.assertEqual(dummy1.weight.scale, cat_qweight1.scale)
+
+        # concat with dim == 1 is not really correct and will be fixed later
+        # when we support distributed checkpointing
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertEqual(cat_qweight2.shape, (256, 256))
+        ref_float8_data = torch.cat(
+            [linear1.weight.float8_data, linear2.weight.float8_data], dim=1
+        )
+        ref_scale = linear1.weight.scale
+        self.assertEqual(cat_qweight2.float8_data, ref_float8_data)
+        self.assertEqual(cat_qweight2.scale, ref_scale)
+
+    def test_transpose(self):
+        dtype = torch.bfloat16
+        device = "cuda"
+        # weight: (256, 128)
+        linear1 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        quantize_(linear1, self.config)
+        linear1.weight = torch.nn.Parameter(linear1.weight.transpose(0, 1).contiguous())
+        linear1.bias = torch.nn.Parameter(torch.randn(128, dtype=dtype, device=device))
+        self.assertEqual(linear1.weight.shape, (128, 256))
+
+        input = torch.randn(32, 256, dtype=dtype, device=device)
+        # make sure it runs
+        res = linear1(input)
+        self.assertEqual(res.shape, (32, 128))
+

 if __name__ == "__main__":
     run_tests()
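
A minimal standalone sketch (not part of the diff) of the bmm pattern the updated test exercises: instead of a separate bmm_config with transpose_input=True, the weight is transposed up front and the single float8 config is reused. The BMM wrapper and the shapes below are illustrative stand-ins mirroring the test; running it assumes an sm90+ GPU with the fbgemm kernels installed.

    import torch
    from torchao.quantization import FbgemmConfig, quantize_

    class BMM(torch.nn.Module):
        # illustrative stand-in for the M module used in the test above
        def __init__(self, weight):
            super().__init__()
            self.weight = torch.nn.Parameter(weight)

        def forward(self, x):
            return torch.bmm(x, self.weight)

    config = FbgemmConfig(
        input_dtype=torch.float8_e4m3fn,
        weight_dtype=torch.float8_e4m3fn,
        output_dtype=torch.bfloat16,
    )
    m = BMM(torch.randn(10, 128, 256, dtype=torch.bfloat16, device="cuda")).eval()
    # transpose the bmm weight first, replacing the old transpose_input=True option
    m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
    quantize_(m, config, filter_fn=lambda mod, fqn: True)
    out = m(torch.randn(10, 32, 128, dtype=torch.bfloat16, device="cuda"))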