Commit fbc0259

Update on "Add generic fake quantized linear for QAT"
**Summary:** This commit adds a generic fake quantized linear module to replace uses of the existing, more specific QAT linears. For example, `Int8DynActInt4WeightQATLinear` can be expressed as follows:

```
from torchao.quantization.prototype.qat.api import FakeQuantizeConfig
from torchao.quantization.prototype.qat.linear import FakeQuantizedLinear

activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=8)
fq_linear = FakeQuantizedLinear(16, 32, False, activation_config, weight_config)
```

The main motivation is to provide a more flexible way to perform QAT on models with linear layers. Previously, we had to create a new linear class every time we wanted to experiment with different fake quantization settings, e.g. a different group size or bit width. Now we can express this easily with a single linear module; see the sketch after this message for an illustrative variation.

**Test Plan:**

```
python test/quantization/test_qat.py -k test_fake_quantize_config_granularity
python test/quantization/test_qat.py -k test_fake_quantize_config_granularity_error_cases
python test/quantization/test_qat.py -k test_fake_quantize_config_mapping_type
python test/quantization/test_qat.py -k test_fake_quantized_linear_8da4w
python test/quantization/test_qat.py -k test_fake_quantized_linear_4w
```

[ghstack-poisoned]
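As a sketch of the flexibility claim above: changing the fake quantization settings now only means changing a config value rather than writing a new linear subclass. The group size and layer shape below are illustrative values, not taken from the commit:

```python
import torch

from torchao.quantization.prototype.qat.api import FakeQuantizeConfig
from torchao.quantization.prototype.qat.linear import FakeQuantizedLinear

# Same int8 per-token activation fake quantization as in the summary,
# but int4 weights with a larger (illustrative) group size of 32.
activation_config = FakeQuantizeConfig(torch.int8, "per_token", is_symmetric=False)
weight_config = FakeQuantizeConfig(torch.int4, group_size=32)

# in_features=64, out_features=128, bias=False -- arbitrary example shapes.
fq_linear = FakeQuantizedLinear(64, 128, False, activation_config, weight_config)
```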
2 parents: 83e2f10 + 5b4feb0

1 file changed (+13, −14 lines)


torchao/quantization/quant_primitives.py

```diff
@@ -106,14 +106,24 @@ class TorchAODType(Enum):
     TorchAODType.INT5: 5,
     TorchAODType.INT6: 6,
     TorchAODType.INT7: 7,
+    torch.uint8: 8,
     torch.int8: 8,
     torch.int16: 16,
     torch.int32: 32,
 }
 
 _SUB_BYTE_UINT_BOUNDS: Dict[Union[torch.dtype, TorchAODType], Tuple[int, int]] = {}
-_SUB_BYTE_INT_BOUNDS: Dict[Union[torch.dtype, TorchAODType], Tuple[int, int]] = {}
+_SUB_BYTE_INT_BOUNDS: Dict[Union[torch.dtype, TorchAODType], Tuple[int, int]] = {
+    TorchAODType.INT1: (-(2**0), 2**0 - 1),
+    TorchAODType.INT2: (-(2**1), 2**1 - 1),
+    TorchAODType.INT3: (-(2**2), 2**2 - 1),
+    TorchAODType.INT4: (-(2**3), 2**3 - 1),
+    TorchAODType.INT5: (-(2**4), 2**4 - 1),
+    TorchAODType.INT6: (-(2**5), 2**5 - 1),
+    TorchAODType.INT7: (-(2**6), 2**6 - 1),
+}
 
+# torch.uintX available only in PyTorch 2.3+
 if TORCH_VERSION_AT_LEAST_2_3:
     _SUB_BYTE_UINT_BOUNDS = {
         torch.uint1: (0, 2**1-1),
@@ -124,18 +134,6 @@ class TorchAODType(Enum):
         torch.uint6: (0, 2**6-1),
         torch.uint7: (0, 2**7-1),
     }
-    _SUB_BYTE_INT_BOUNDS = {
-        TorchAODType.INT1: (-(2**0), 2**0 - 1),
-        TorchAODType.INT2: (-(2**1), 2**1 - 1),
-        TorchAODType.INT3: (-(2**2), 2**2 - 1),
-        TorchAODType.INT4: (-(2**3), 2**3 - 1),
-        TorchAODType.INT5: (-(2**4), 2**4 - 1),
-        TorchAODType.INT6: (-(2**5), 2**5 - 1),
-        TorchAODType.INT7: (-(2**6), 2**6 - 1),
-    }
-    _DTYPE_TO_QVALUE_BOUNDS.update(_SUB_BYTE_UINT_BOUNDS)
-    _DTYPE_TO_QVALUE_BOUNDS.update(_SUB_BYTE_INT_BOUNDS)
-
     _DTYPE_TO_BIT_WIDTH.update({
         torch.uint1: 1,
         torch.uint2: 2,
@@ -144,9 +142,10 @@ class TorchAODType(Enum):
         torch.uint5: 5,
         torch.uint6: 6,
         torch.uint7: 7,
-        torch.uint8: 8,
     })
 
+_DTYPE_TO_QVALUE_BOUNDS.update(_SUB_BYTE_UINT_BOUNDS)
+_DTYPE_TO_QVALUE_BOUNDS.update(_SUB_BYTE_INT_BOUNDS)
 assert _DTYPE_TO_BIT_WIDTH.keys() == _DTYPE_TO_QVALUE_BOUNDS.keys()
 
 _ONES_TABLE = [_n_ones(i) for i in range(8)]
```
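For reference, each `_SUB_BYTE_INT_BOUNDS` entry in the diff is the two's-complement range of a b-bit signed integer, [-2^(b-1), 2^(b-1) - 1]. A minimal standalone check (plain Python, no torchao imports needed):

```python
# Reproduce the signed sub-byte bounds from the diff above.
for bits in range(1, 8):
    lo, hi = -(2 ** (bits - 1)), 2 ** (bits - 1) - 1
    print(f"INT{bits}: ({lo}, {hi})")
# e.g. INT4 -> (-8, 7), matching TorchAODType.INT4: (-(2**3), 2**3 - 1)
```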
