Commit 5642f44
Update on "Add generic fake quantized linear for QAT"

**Summary:** This commit adds a generic fake quantized linear module to replace the existing, more specific QAT linears. For example, `Int8DynActInt4WeightQATLinear` can now be expressed as follows:

```python
import torch

from torchao.quantization.prototype.qat.api import FakeQuantizeConfig
from torchao.quantization.prototype.qat.linear import FakeQuantizedLinear

activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=8)
fq_linear = FakeQuantizedLinear(16, 32, False, activation_config, weight_config)
```

The main motivation is to provide a more flexible way to perform QAT on models with linear layers. Previously, we had to create a new linear class every time we wished to experiment with different fake quantization settings, e.g. a different group size or bit width. Now we can express these settings through a single linear module.

**Test Plan:**

```
python test/quantization/test_qat.py -k test_fake_quantize_config_granularity
python test/quantization/test_qat.py -k test_fake_quantize_config_granularity_error_cases
python test/quantization/test_qat.py -k test_fake_quantize_config_mapping_type
python test/quantization/test_qat.py -k test_fake_quantized_linear_8da4w
python test/quantization/test_qat.py -k test_fake_quantized_linear_4w
```

[ghstack-poisoned]
2 parents fbc0259 + 756cb8d
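As a quick sanity check on the API in the summary above, a fake-quantized forward pass behaves like an ordinary linear forward. This is a minimal sketch, not part of the commit: the input shape and the assumption that `FakeQuantizedLinear` follows `nn.Linear`'s call signature are illustrative.

```python
import torch

from torchao.quantization.prototype.qat.api import FakeQuantizeConfig
from torchao.quantization.prototype.qat.linear import FakeQuantizedLinear

activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=8)
fq_linear = FakeQuantizedLinear(16, 32, False, activation_config, weight_config)

# Fake quantization keeps tensors in floating point, so the output is an
# ordinary torch.Tensor; the (4, 16) input shape here is an illustrative choice.
x = torch.randn(4, 16)
y = fq_linear(x)
print(y.shape)  # torch.Size([4, 32])
```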

2 files changed: +11, −8 lines

test/integration/test_integration.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -1126,6 +1126,7 @@ def test_shape_logger(self):
 class SmoothquantIntegrationTest(unittest.TestCase):
     @torch.no_grad()
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "newer dtypes not supported")
     def test_non_dynamically_quantizable_linear(self):
         if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
             self.skipTest("test requires SM capability of at least (8, 0).")
```

test/quantization/test_qat.py

Lines changed: 10 additions & 8 deletions

```diff
@@ -58,6 +58,7 @@
     groupwise_affine_quantize_tensor,
 )
 from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_3,
     TORCH_VERSION_AT_LEAST_2_4,
     TORCH_VERSION_AT_LEAST_2_5,
 )
@@ -753,14 +754,15 @@ def test_fake_quantize_config_dtype(self):
         with self.assertRaisesRegex(ValueError, msg):
             FakeQuantizeConfig(torch.float32, "per_token")
         # OK
-        FakeQuantizeConfig(torch.uint1, "per_token")
-        FakeQuantizeConfig(torch.uint2, "per_token")
-        FakeQuantizeConfig(torch.uint3, "per_token")
-        FakeQuantizeConfig(torch.uint4, "per_token")
-        FakeQuantizeConfig(torch.uint5, "per_token")
-        FakeQuantizeConfig(torch.uint6, "per_token")
-        FakeQuantizeConfig(torch.uint7, "per_token")
-        FakeQuantizeConfig(torch.uint8, "per_token")
+        if TORCH_VERSION_AT_LEAST_2_3:
+            FakeQuantizeConfig(torch.uint1, "per_token")
+            FakeQuantizeConfig(torch.uint2, "per_token")
+            FakeQuantizeConfig(torch.uint3, "per_token")
+            FakeQuantizeConfig(torch.uint4, "per_token")
+            FakeQuantizeConfig(torch.uint5, "per_token")
+            FakeQuantizeConfig(torch.uint6, "per_token")
+            FakeQuantizeConfig(torch.uint7, "per_token")
+            FakeQuantizeConfig(torch.uint8, "per_token")
         FakeQuantizeConfig(TorchAODType.INT1, "per_token")
         FakeQuantizeConfig(TorchAODType.INT2, "per_token")
         FakeQuantizeConfig(TorchAODType.INT3, "per_token")
```
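The gating is needed because the sub-byte unsigned dtypes (`torch.uint1` through `torch.uint7`) only exist on newer PyTorch builds, as the "newer dtypes not supported" skip message in the first diff notes. Below is a minimal sketch of the same guard pattern, assuming only that `torchao.utils` exports the `TORCH_VERSION_AT_LEAST_2_3` flag used above:

```python
import torch
from torchao.utils import TORCH_VERSION_AT_LEAST_2_3

# torch.uint1 .. torch.uint7 are only defined on PyTorch >= 2.3, so any
# code that references them must be version-gated, as in the test above.
if TORCH_VERSION_AT_LEAST_2_3:
    uint_dtypes = [getattr(torch, f"uint{i}") for i in range(1, 9)]
else:
    uint_dtypes = [torch.uint8]  # the only unsigned integer dtype on older builds
```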
