
Commit 5bdee5e

float8 in quantize()
1 parent c842d50

File tree

2 files changed: +35 −4 lines


torchao/_models/llama/eval.py

Lines changed: 17 additions & 2 deletions
@@ -20,6 +20,8 @@
     fpx_weight_only,
     uintx_weight_only,
     unwrap_tensor_subclass,
+    float8_weight_only,
+    float8_dynamic_activation_float8_weight,
 )
 from torchao._models._eval import TransformerEvalWrapper, InputRecorder
 
@@ -28,6 +30,7 @@
 from torchao.quantization.GPTQ import Int4WeightOnlyGPTQQuantizer
 from torchao._models.llama.model import prepare_inputs_for_model
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+from torchao.quantization.observer import PerTensor, PerRow
 
 def run_evaluation(
     checkpoint_path: Path,
@@ -117,7 +120,19 @@ def run_evaluation(
         else:
             if not TORCH_VERSION_AT_LEAST_2_5:
                 unwrap_tensor_subclass(model)
-
+        if "float8wo" in quantization:
+            quantize_(model, float8_weight_only())
+        if "float8dq" in quantization:
+            granularity = quantization.split("-")[-1]
+            if granularity == "float8dq":
+                granularity = "tensor"
+            if granularity == "tensor":
+                granularity = PerTensor()
+            elif granularity == "row":
+                granularity = PerRow()
+            else:
+                raise ValueError(f"float8dq granularity needs to be either tensor or row but got {granularity}")
+            quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity))
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)
     with torch.no_grad():
@@ -140,7 +155,7 @@ def run_evaluation(
     parser.add_argument('--limit', type=int, default=None, help='Number of eval samples to evaluate')
     parser.add_argument('--precision', type=lambda x: getattr(torch, x.split(".")[-1]), default=torch.bfloat16, help='dtype precision to use')
     parser.add_argument('--device', type=str, default="cuda", help='Device to use for evaluation')
-    parser.add_argument("-q", "--quantization", type=str, help="Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, int4wo-<groupsize>-gptq, int4wo-<groupsize>-hqq, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq")
+    parser.add_argument("-q", "--quantization", type=str, help="Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, int4wo-<groupsize>-gptq, int4wo-<groupsize>-hqq, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, float8wo, float8dq-<granularity>")
     parser.add_argument('--compile', action='store_true', help='Whether to compile the model.')
     parser.add_argument('--max_length', type=int, default=None, help='Length of text to process at one time')
    parser.add_argument('--calibration_tasks', type=str, nargs='+', default=['wikitext'], help='tasks to do gptq calibration on, if doing gptq')
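
With this change, the new techniques can be selected from the command line. Hypothetical invocations (the checkpoint path is illustrative, not part of this commit):

    python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth -q float8wo
    python torchao/_models/llama/eval.py --checkpoint_path checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth -q float8dq-row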

torchao/_models/llama/generate.py

Lines changed: 18 additions & 2 deletions
@@ -210,7 +210,10 @@ def main(
             fpx_weight_only,
             uintx_weight_only,
             autoquant,
-            unwrap_tensor_subclass
+            unwrap_tensor_subclass,
+            float8_weight_only,
+            float8_dynamic_activation_float8_weight,
         )
+        from torchao.quantization.observer import PerTensor, PerRow
         if "int8wo" in quantization:
             quantize_(model, int8_weight_only())
@@ -290,6 +292,19 @@ def main(
             dtype = _NBITS_TO_DTYPE[nbits]
             group_size = int(_quant_args[1])
             quantize_(model, uintx_weight_only(dtype, group_size, use_hqq=use_hqq))
+        if "float8wo" in quantization:
+            quantize_(model, float8_weight_only())
+        if "float8dq" in quantization:
+            granularity = quantization.split("-")[-1]
+            if granularity == "float8dq":
+                granularity = "tensor"
+            if granularity == "tensor":
+                granularity = PerTensor()
+            elif granularity == "row":
+                granularity = PerRow()
+            else:
+                raise ValueError(f"float8dq granularity needs to be either tensor or row but got {granularity}")
+            quantize_(model, float8_dynamic_activation_float8_weight(granularity=granularity))
         if "autoquant" in quantization:
             if "autoquant-int4" == quantization:
                 model = autoquant(model, manual=True, qtensor_class_list = torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST)
@@ -459,7 +474,7 @@ def callback(x):
     parser.add_argument('--top_k', type=int, default=200, help='Top-k for sampling.')
     parser.add_argument('--temperature', type=float, default=0.8, help='Temperature for sampling.')
     parser.add_argument('--checkpoint_path', type=Path, default=Path("../../../checkpoints/meta-llama/Llama-2-7b-chat-hf/model.pth"), help='Model checkpoint path.')
-    parser.add_argument('-q', '--quantization', type=str, help='Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, autoquant, autoquant-int4, int4wo-<groupsize>-hqq, autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq')
+    parser.add_argument('-q', '--quantization', type=str, help='Which quantization techniques to apply: int8dq, int8wo, int4wo-<groupsize>, autoquant, autoquant-int4, int4wo-<groupsize>-hqq, autoround-<model_device>-<quant_lm_head>-<iters>-<groupsize>-<batch_size>-<seqlen>-<nsamples>, uintx-<nbits>-<groupsize>, uintx-<nbits>-<groupsize>-hqq, float8wo, float8dq-<granularity>')
     parser.add_argument('--kv_cache_quantization', action='store_true', help='Whether to quantize the KV cache')
     parser.add_argument('--cache_size', type=int, default=None, help='Force size of cache to be a certain number of tokens, if not set, will use max_new_tokens+prompt_size')
     parser.add_argument('--linear_causal_mask', action='store_true', help='Whether to use the memory efficient, but slightly less fast, linear causal mask (important for long context lengths)')
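
For reference, a minimal standalone sketch of what the two new flags reduce to when applied to a model. The toy Linear module stands in for Llama and is illustrative only; import paths mirror the eval.py hunk above but may differ across torchao versions, and float8 kernels generally require a GPU with native float8 support (e.g. H100):

    import torch
    from torchao.quantization import (
        quantize_,
        float8_weight_only,
        float8_dynamic_activation_float8_weight,
    )
    from torchao.quantization.observer import PerTensor, PerRow  # granularity types

    # Toy stand-in for the Llama model (not part of this commit).
    model = torch.nn.Sequential(torch.nn.Linear(1024, 1024, bias=False))
    model = model.to(device="cuda", dtype=torch.bfloat16)

    # -q float8wo: weight-only float8 quantization.
    quantize_(model, float8_weight_only())

    # -q float8dq-tensor / -q float8dq-row: dynamic float8 activations with
    # float8 weights at per-tensor or per-row scaling granularity. Apply to
    # a fresh model rather than re-quantizing the one above.
    # quantize_(model, float8_dynamic_activation_float8_weight(granularity=PerRow()))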
