* then we apply equalization scale to linear activation with to_weight_tensor_with_linear_activation_scale_metadata (input activation will be divided by equalization_scale), and then call F.linear with
  scaled input activation and quantized weight (so we can reuse the efficient quantized linear kernels used by quantized weight)
"""
-import torch
+
import copy

+import torch
import torch.nn.functional as F
from torch import Tensor
+
from torchao.dtypes import (
-    to_affine_quantized_intx_static,
-    to_affine_quantized_floatx_static,
    Float8Layout,
+    to_affine_quantized_floatx_static,
+    to_affine_quantized_intx_static,
)
-from torchao.quantization.utils import compute_error
-from torchao.quantization import quantize_
-from torchao.quantization import to_weight_tensor_with_linear_activation_scale_metadata
-from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
-from torchao.quantization.observer import (
-    AffineQuantizedMinMaxObserver,
+from torchao.quantization import (
+    quantize_,
+    to_weight_tensor_with_linear_activation_scale_metadata,
)
from torchao.quantization.granularity import (
    PerAxis,
    PerTensor,
)
+from torchao.quantization.observer import (
+    AffineQuantizedMinMaxObserver,
+)
+from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
from torchao.quantization.quant_primitives import (
    MappingType,
-    FP8_TYPES,
)
+from torchao.quantization.utils import compute_error


class ObservedLinear(torch.nn.Linear):
-    def __init__(self, in_features: int, out_features: int, act_obs: torch.nn.Module, weight_obs: torch.nn.Module, bias: bool = True, device=None, dtype=None):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        act_obs: torch.nn.Module,
+        weight_obs: torch.nn.Module,
+        bias: bool = True,
+        device=None,
+        dtype=None,
+    ):
        super().__init__(in_features, out_features, bias, device, dtype)
        self.act_obs = act_obs
        self.weight_obs = weight_obs
@@ -46,11 +58,20 @@ def forward(self, input: Tensor):

    @classmethod
    def from_float(cls, float_linear, act_obs, weight_obs):
-        observed_linear = cls(float_linear.in_features, float_linear.out_features, act_obs, weight_obs, False, device=float_linear.weight.device, dtype=float_linear.weight.dtype)
+        observed_linear = cls(
+            float_linear.in_features,
+            float_linear.out_features,
+            act_obs,
+            weight_obs,
+            False,
+            device=float_linear.weight.device,
+            dtype=float_linear.weight.dtype,
+        )
        observed_linear.weight = float_linear.weight
        observed_linear.bias = float_linear.bias
        return observed_linear

+
def insert_observers_(model, act_obs, weight_obs):
    _is_linear = lambda m, fqn: isinstance(m, torch.nn.Linear)

@@ -61,22 +82,39 @@ def replacement_fn(m):

    _replace_with_custom_fn_if_matches_filter(model, replacement_fn, _is_linear)

+
# converting observed linear module to linear module with quantized weights (and quantized activations)
# with tensor subclasses
def apply_awq(target_dtype: torch.dtype):
    # target_dtype = torch.uint8
    def _apply_awq_to_linear(observed_linear):
        # weight quantization
        weight_scale, weight_zero_point = observed_linear.weight_obs.calculate_qparams()
+
        def weight_quant_func(weight):
            block_size = (1, weight.shape[1])
            if target_dtype == torch.uint8:
-                return to_affine_quantized_intx_static(weight, weight_scale, weight_zero_point, block_size, target_dtype)
+                return to_affine_quantized_intx_static(
+                    weight, weight_scale, weight_zero_point, block_size, target_dtype
+                )
            elif target_dtype == torch.float8_e4m3fn:
-                return to_affine_quantized_floatx_static(weight, weight_scale, block_size, target_dtype, Float8Layout(mm_config=None))
+                return to_affine_quantized_floatx_static(
+                    weight,
+                    weight_scale,
+                    block_size,
+                    target_dtype,
+                    Float8Layout(mm_config=None),
+                )
            else:
                raise ValueError(f"Unsupported target dtype {target_dtype}")
-        linear = torch.nn.Linear(observed_linear.in_features, observed_linear.out_features, False, device=observed_linear.weight.device, dtype=observed_linear.weight.dtype)
+
+        linear = torch.nn.Linear(
+            observed_linear.in_features,
+            observed_linear.out_features,
+            False,
+            device=observed_linear.weight.device,
+            dtype=observed_linear.weight.dtype,
+        )
        linear.weight = observed_linear.weight
        linear.bias = observed_linear.bias

@@ -86,16 +124,22 @@ def weight_quant_func(weight):
        equalization_scale, _ = observed_linear.act_obs.calculate_qparams()
        equalization_scale = torch.ones_like(equalization_scale)

-        linear.weight = torch.nn.Parameter(weight_quant_func(linear.weight * equalization_scale), requires_grad=False)
+        linear.weight = torch.nn.Parameter(
+            weight_quant_func(linear.weight * equalization_scale), requires_grad=False
+        )

-        linear.weight = torch.nn.Parameter(to_weight_tensor_with_linear_activation_scale_metadata(linear.weight, equalization_scale), requires_grad=False)
+        linear.weight = torch.nn.Parameter(
+            to_weight_tensor_with_linear_activation_scale_metadata(
+                linear.weight, equalization_scale
+            ),
+            requires_grad=False,
+        )

        return linear

    return _apply_awq_to_linear


-
######## Test ##########
class ToyLinearModel(torch.nn.Module):
    def __init__(self, m=64, n=32, k=64):
@@ -104,7 +148,11 @@ def __init__(self, m=64, n=32, k=64):
        self.linear2 = torch.nn.Linear(k, n, bias=False)

    def example_inputs(self, batch_size=1, dtype=torch.float32, device="cpu"):
-        return (torch.randn(batch_size, self.linear1.in_features, dtype=dtype, device=device),)
+        return (
+            torch.randn(
+                batch_size, self.linear1.in_features, dtype=dtype, device=device
+            ),
+        )

    def forward(self, x):
        x = self.linear1(x)
@@ -119,16 +167,24 @@ def test_awq(target_dtype: torch.dtype, mapping_type: MappingType):
    dtype = torch.bfloat16
    m = ToyLinearModel().eval().to(dtype).to("cuda")

-    m_for_test = copy.deepcopy(m)
-
    m_bf16 = copy.deepcopy(m)
    example_inputs = m.example_inputs(dtype=dtype, device="cuda")
    print("example inputs shape:", example_inputs[0].shape)

-    m_bf16 = torch.compile(m_bf16, mode='max-autotune')
-
-    act_obs = AffineQuantizedMinMaxObserver(mapping_type, target_dtype, granularity_type=PerTensor(), eps=torch.finfo(torch.float32).eps)
-    weight_obs = AffineQuantizedMinMaxObserver(mapping_type, target_dtype, granularity_type=PerAxis(axis=0), eps=torch.finfo(torch.float32).eps)
+    m_bf16 = torch.compile(m_bf16, mode="max-autotune")
+
+    act_obs = AffineQuantizedMinMaxObserver(
+        mapping_type,
+        target_dtype,
+        granularity_type=PerTensor(),
+        eps=torch.finfo(torch.float32).eps,
+    )
+    weight_obs = AffineQuantizedMinMaxObserver(
+        mapping_type,
+        target_dtype,
+        granularity_type=PerAxis(axis=0),
+        eps=torch.finfo(torch.float32).eps,
+    )

    before_quant = m(*example_inputs)

@@ -137,9 +193,9 @@ def test_awq(target_dtype: torch.dtype, mapping_type: MappingType):
    for _ in range(10):
        m(*example_inputs)

-    after_obs = m(*example_inputs)
+    m(*example_inputs)

-    m2 = copy.deepcopy(m)
+    copy.deepcopy(m)

    is_observed_linear = lambda m, fqn: isinstance(m, ObservedLinear)

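For reference, a minimal sketch of how the pieces touched by this diff are typically driven end to end. The conversion call at the end of test_awq is not shown in this hunk, so the quantize_(model, conversion_fn, filter_fn) calling convention and the MappingType.ASYMMETRIC choice below are assumptions, not part of the diff:

# Sketch only: wiring insert_observers_, calibration, and apply_awq together.
# Assumes quantize_ accepts the module conversion function returned by apply_awq
# plus a filter function (as in the surrounding calibration-flow tutorials).
target_dtype = torch.uint8
mapping_type = MappingType.ASYMMETRIC  # assumption, not shown in this hunk

model = ToyLinearModel().eval().to(torch.bfloat16).to("cuda")
example_inputs = model.example_inputs(dtype=torch.bfloat16, device="cuda")

act_obs = AffineQuantizedMinMaxObserver(
    mapping_type, target_dtype,
    granularity_type=PerTensor(), eps=torch.finfo(torch.float32).eps,
)
weight_obs = AffineQuantizedMinMaxObserver(
    mapping_type, target_dtype,
    granularity_type=PerAxis(axis=0), eps=torch.finfo(torch.float32).eps,
)

# Swap every nn.Linear for an ObservedLinear, then run a few calibration passes
# so the observers record activation and weight statistics.
insert_observers_(model, act_obs, weight_obs)
for _ in range(10):
    model(*example_inputs)

# Convert only the observed linears to linears with quantized weights and
# the activation equalization-scale metadata.
is_observed_linear = lambda m, fqn: isinstance(m, ObservedLinear)
quantize_(model, apply_awq(target_dtype), is_observed_linear)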