pytorch
diff --git a/test/dtypes/test_affine_quantized_tensor_parallel.py b/test/dtypes/test_affine_quantized_tensor_parallel.py
Lines changed: 12 additions & 0 deletions
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
Lines changed: 34 additions & 0 deletions
diff --git a/torchao/_models/llama/benchmark_results.txt b/torchao/_models/llama/benchmark_results.txt
Lines changed: 92 additions & 0 deletions
diff --git a/torchao/_models/llama/benchmarks.sh b/torchao/_models/llama/benchmarks.sh
Lines changed: 35 additions & 40 deletions
@@ -138,9 +138,21 @@ class TestInt4woAffineQuantizedTensorParallel(TestAffineQuantizedTensorParallel)
     def test_tp(self, dtype):
         return self._test_tp(dtype)
 
+class TestGemliteLayoutTensorParallel(TestAffineQuantizedTensorParallel):  # tensor-parallel test using gemlite uintx weight-only quantization
+    from torchao.quantization import gemlite_uintx_weight_only  # class-scope import: the name lives in the class namespace
+    QUANT_METHOD_FN = staticmethod(gemlite_uintx_weight_only)  # staticmethod so attribute access never binds `self`
+    COMMON_DTYPES = [torch.float16]  # the only dtype this test is parametrized over
+
+    @common_utils.parametrize("dtype", COMMON_DTYPES)
+    @with_comms
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    def test_tp_gemlite(self, dtype):
+        return self._test_tp(dtype)  # delegates to the inherited tensor-parallel check
+
 
 common_utils.instantiate_parametrized_tests(TestInt8woAffineQuantizedTensorParallel)
 common_utils.instantiate_parametrized_tests(TestInt4woAffineQuantizedTensorParallel)
+common_utils.instantiate_parametrized_tests(TestGemliteLayoutTensorParallel)
 
 # Run only on H100
 if torch.cuda.is_available() and torch.cuda.get_device_capability() >= (9, 0):
 
@@ -96,6 +96,12 @@
 )
 from torchao.dtypes.utils import is_device
 
+try:
+    import gemlite  # noqa: F401 -- imported only to probe whether the optional dependency is usable
+    has_gemlite = True
+except ImportError:  # covers a missing package (ModuleNotFoundError) and a present-but-broken install
+    has_gemlite = False
+
 logger = logging.getLogger("INFO")
 
 torch.manual_seed(0)
@@ -870,6 +876,9 @@ def _test_lin_weight_subclass_api_impl(
         ref_f = mod(x)
         api(mod)
 
+        # test get_plain()
+        mod[0].weight.tensor_impl.get_plain()
+
         test = mod(x)
         self.assertGreater(
             SQNR(ref_f, test),
@@ -930,6 +939,31 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
                 test_dtype=dtype
             )
 
+    @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_5, "gemlite tests needs torch 2.5 or greater")
+    @unittest.skipIf(not has_gemlite, "gemlite not available")
+    def test_gemlite_layout(self, device, dtype):
+        """Check gemlite weight-only quantization over packing/bit-width/group-size/shape combos (fp16, non-CPU only)."""
+        if dtype != torch.float16:
+            self.skipTest("gemlite only works for fp16 dtype")
+        if device == "cpu":
+            self.skipTest(f"gemlite is for cuda, not {device}")
+        from torchao.quantization import gemlite_uintx_weight_only
+
+        for packing_bitwidth in [32, 8]:
+            for bit_width in [4, 8]:
+                # Explicit group sizes are only swept for 4-bit weights; 8-bit uses group_size=None.
+                for group_size in [64, 32, None] if bit_width == 4 else [None]:
+                    def api(mod):
+                        return quantize_(mod, gemlite_uintx_weight_only(group_size, bit_width, packing_bitwidth))
+                    for test_shape in [[1, 1024, 512], [16, 256, 1024], [128, 256, 1024]]:
+                        # subTest labels a failure with its configuration instead of printing every combination.
+                        with self.subTest(packing_bitwidth=packing_bitwidth, bit_width=bit_width, group_size=group_size, test_shape=test_shape):
+                            self._test_lin_weight_subclass_api_impl(
+                                api, device, 15, test_shape=test_shape, test_dtype=dtype,
+                            )
+
+
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "int4 requires torch nightly.")
     # @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_5, "int4 skipping 2.5+ for now")
 
@@ -1,15 +1,10 @@
 export CHECKPOINT_PATH=../../../checkpoints # path to checkpoints folder
 
 # README BENCHMARKS
-export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-4-None  --write_result benchmark_results.txt
+# export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
+
+export MODEL_REPO=meta-llama/Meta-Llama-3-8B
 
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-64  --write_result benchmark_results.txt
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-64  --write_result benchmark_results.txt
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-None  --write_result benchmark_results.txt
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-None  --write_result benchmark_results.txt
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-8-64  --write_result benchmark_results.txt
-# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-8-64  --write_result benchmark_results.txt
 
 # python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-64  --write_result benchmark_results.txt
 # python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-16-4-64  --write_result benchmark_results.txt
@@ -105,7 +100,7 @@ export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-None  --write_result benchmark_results.txt
 
-# export MODEL_REPO=meta-llama/Meta-Llama-3-8B
+export MODEL_REPO=meta-llama/Meta-Llama-3-8B
 # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
 # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16
 # # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt
@@ -148,16 +143,16 @@ export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 32
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8dq --write_result benchmark_results.txt --batch_size 128
 
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-64  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-64  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-None  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-None  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-8-None  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-8-None  --write_result benchmark_results.txt
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-4-64  --write_result benchmark_results.txt  --num_samples 1
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-32-4-64  --write_result benchmark_results.txt  --num_samples 1
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-4-None  --write_result benchmark_results.txt  --num_samples 1 # FIXME: this configuration (gemlite-8-4-None) is not working
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-32-4-None  --write_result benchmark_results.txt --num_samples 1
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-8-None  --write_result benchmark_results.txt --num_samples 1
+python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-32-8-None  --write_result benchmark_results.txt --num_samples 1
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt
 # python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-None  --write_result benchmark_results.txt
 
 # # 2:4 sparse model
@@ -169,24 +164,24 @@ python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/mode
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 32
 # python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --quantization int8wo --write_result benchmark_results.txt --batch_size 128
 
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16 --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-64  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-64  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-None  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-None  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-8-None  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-8-None  --write_result benchmark_results.txt --batch_size 8
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt --batch_size 8
-
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16 --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-64  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-64  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-None  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-None  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-8-None  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-8-None  --write_result benchmark_results.txt --batch_size 32
-python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --batch_size 8
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16 --batch_size 8
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt --batch_size 8
+# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-4-64  --write_result benchmark_results.txt --batch_size 8 --num_samples 1
+# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-32-4-64  --write_result benchmark_results.txt --batch_size 8 --num_samples 1
+# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-4-None  --write_result benchmark_results.txt --batch_size 8 --num_samples 1
+# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-32-4-None  --write_result benchmark_results.txt --batch_size 8 --num_samples 1
+# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-8-8-None  --write_result benchmark_results.txt --batch_size 8 --num_samples 1
+# python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemlite-32-8-None  --write_result benchmark_results.txt --batch_size 8 --num_samples 1
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt --batch_size 8
+
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --write_result benchmark_results.txt --precision float16 --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-64  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-64  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-4-None  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-4-None  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-8-8-None  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --precision float16 --quantization gemsub-32-8-None  --write_result benchmark_results.txt --batch_size 32
+# python generate.py --compile --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo  --write_result benchmark_results.txt --batch_size 32