Add floating point options for autoquant and add accuracy measurement (#1355)

jerryzh168 · web-flow · commit 543209b0592b · 2024-11-26T19:54:33.000-08:00
* Add floating point options for autoquant and add accuracy measurement Summary: * This PR adds float32/float16/bfloat16 as a list of options for autoquant, it converts input/weight/bias/output to the specified dtype * Also adds min_sqnr (https://en.wikipedia.org/wiki/Signal-to-quantization-noise_ratio) to allow users to filter out the quantization methods that has large numerical impact compared to original output Note that we use random generated input activation right now, we can improve this by adding the support for using real inputs Test Plan: python generate.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --compile --compile_prefill --quantization autoquant-fp Reviewers: Subscribers: Tasks: Tags: * update docstring * fix * ruff * skip if no cuda
diff --git a/examples/sam2_amg_server/server.py b/examples/sam2_amg_server/server.py
@@ -371,6 +371,7 @@ def main(checkpoint_path,
          baseline=False,
          fast=False,
          furious=False,
+         use_autoquant=False,
          unittest=False,
          benchmark=False,
          profile=None,
@@ -399,13 +400,13 @@ def main(checkpoint_path,
         from torchao._models.sam2.build_sam import build_sam2
         from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
         from torchao._models.sam2.utils.amg import rle_to_mask
-    
+
     device = "cuda"
     sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type)
-    
+
     logging.info(f"Loading model {sam2_checkpoint} with config {model_cfg}")
     sam2 = build_sam2(model_cfg, sam2_checkpoint, device=device, apply_postprocessing=False)
- 
+
     logging.info(f"Using {points_per_batch} points_per_batch")
     mask_generator = SAM2AutomaticMaskGenerator(sam2, points_per_batch=points_per_batch, output_mode="uncompressed_rle")
 
@@ -416,6 +417,18 @@ def main(checkpoint_path,
     if furious:
         set_furious(mask_generator)
 
+    # since autoquant is replicating what furious mode is doing, don't use these two together
+    elif use_autoquant:
+        from torchao import autoquant
+        from torchao.quantization import DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST
+        mask_generator.predictor.model = autoquant(mask_generator.predictor.model, qtensor_class_list=DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, min_sqnr=40)
+
+        mask_generator.predictor.model.image_encoder = mask_generator.predictor.model.image_encoder.to(torch.float16, min_sqnr=40)
+        # NOTE: Not baseline feature
+        mask_generator.predictor._transforms_device = mask_generator.predictor.device
+        torch.set_float32_matmul_precision('high')
+
+
     with open('dog.jpg', 'rb') as f:
         image_tensor = file_bytes_to_image_tensor(bytearray(f.read()))
 
@@ -494,7 +507,7 @@ async def upload_rle(image: UploadFile = File(...)):
         await request_queue.put((image_tensor, response_future))
         masks = await response_future
         return masks_to_rle_dict(masks)
-    
+
     @app.post("/upload")
     async def upload_image(image: UploadFile = File(...)):
         image_tensor = file_bytes_to_image_tensor(bytearray(await image.read()))
@@ -512,7 +525,7 @@ async def upload_image(image: UploadFile = File(...)):
         plt.savefig(buf, format='png')
         buf.seek(0)
         return StreamingResponse(buf, media_type="image/png")
-    
+
 
     # uvicorn.run(app, host=host, port=port, log_level="info")
     uvicorn.run(app, host=host, port=port)
diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
@@ -1514,6 +1514,23 @@ def forward(self, x):
         assert not isinstance(model.lin1.weight.weight, AutoQuantizableLinearWeight)
         model(x_in)
 
+    @parameterized.expand(list(itertools.product(["cuda"], COMMON_DTYPES)))
+    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    def test_autoquant_min_sqnr(self, device, dtype):
+        m, k, n = 128, 128, 128
+        example_input = torch.randn(m, k, device=device, dtype=dtype)
+        model = torch.nn.Sequential(
+            torch.nn.ReLU(),
+            torch.nn.Linear(k,n),
+            torch.nn.ReLU(),
+        ).to(device).to(dtype)
+        out = model(example_input)
+        torchao.autoquant(model, min_sqnr=60)
+        out2 = model(example_input)
+        sqnr = SQNR(out, out2)
+        # without setting min_sqnr to 60, we get around 45-50 final sqnr
+        # setting min_sqnr for individual linear to be 60 allows us to achieve >= 50 final sqnr
+        self.assertTrue(sqnr >= 50, f"sqnr: {sqnr}")
 
 
 
diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py
@@ -402,6 +402,8 @@ def main(
                 model = autoquant(model, manual=True, qtensor_class_list = torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST, example_input=inputs)
             elif "autoquant-float8" == quantization:
                 model = autoquant(model, manual=True, qtensor_class_list = torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST, example_input=inputs)
+            if "autoquant-fp" == quantization:
+                model = autoquant(model, manual=True, qtensor_class_list = torchao.quantization.DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST, example_input=inputs)
             else:
                 model = autoquant(model, manual=True, example_input=inputs)
 
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
@@ -11,6 +11,7 @@
 
 from .autoquant import (
     DEFAULT_AUTOQUANT_CLASS_LIST,
+    DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
     DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
     OTHER_AUTOQUANT_CLASS_LIST,
     autoquant,
@@ -89,6 +90,7 @@
     "autoquant",
     "DEFAULT_AUTOQUANT_CLASS_LIST",
     "DEFAULT_INT4_AUTOQUANT_CLASS_LIST",
+    "DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST",
     "OTHER_AUTOQUANT_CLASS_LIST",
     # top level API - manual
     "quantize_",
diff --git a/torchao/quantization/autoquant.py b/torchao/quantization/autoquant.py