@@ -869,6 +869,7 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
         self.quant_input = self._mod_extra_config.inputs[0]
         self.dequant_output = self._mod_extra_config.outputs[0]
         if self.use_qdq:
+            self.qdq_input = self._mod_extra_config.inputs[1]
             self.update = self.update_qdq
             mod.update = self.update_qdq
         else:
@@ -885,8 +886,23 @@ def allocate(self, inp_seq_len, dtype, device, shape):
 
     # overwrite update function of original module to force quant and dequant of cache input and output
     def update_qdq(self, prev, cur, dim, idx, inp_seq_len):
-        qinput = self.quant_input(cur)
-        output = self.org_update(prev, qinput, dim, idx, inp_seq_len)
889+ """
890+ Explanation: If we want to optimize index_copy so it would run in fp8 instead of bf16
891+ we need the tensors to be in fp8 before calling index_copy.
892+ Also the `prev` and `curr` tensors need to be of the same dtype - and quanting them both
893+ from bf16 is no help, best we can do is have prev be initialized an fp8 tensor from the start.
894+ Since the initilization of `prev` is done in OHF (and that is not implemented yet) we
895+ currently need to support both options until the implementation in OHF is done, then
896+ can we remove the support for the bf16 `prev` option (the else here).
897+ """
+        if prev.dtype == torch.float8_e4m3fn:
+            qcurr = self.quant_input(cur)
+            qoutput = self.org_update(prev, qcurr, dim, idx, inp_seq_len)
+            output = self.dequant_output(qoutput)
+        # TODO: remove the `else` part once the lp_dtype is implemented in OHF
+        else:
+            curr = self.qdq_input(cur)
+            output = self.org_update(prev, curr, dim, idx, inp_seq_len)
         return output
 
     # overwrite update function of original module to force quant and dequant of cache input and output
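
For readers outside this stack, here is a minimal standalone sketch of the two paths `update_qdq` now takes. The `quant`/`dequant` helpers and the fixed `SCALE` are hypothetical stand-ins for the measured `quant_input`/`dequant_output`/`qdq_input` wrappers, and the fp8 `index_copy_` branch assumes a backend that supports indexing ops on `float8_e4m3fn` tensors; the demo call below exercises only the bf16 fallback:

```python
import torch

SCALE = 0.5  # hypothetical per-tensor scale; the real wrappers carry calibrated scales

def quant(t):
    # bf16 -> fp8 with a per-tensor scale (stand-in for self.quant_input)
    return (t / SCALE).to(torch.float8_e4m3fn)

def dequant(t):
    # fp8 -> bf16, undoing the scale (stand-in for self.dequant_output)
    return t.to(torch.bfloat16) * SCALE

def update_qdq_sketch(prev, cur, dim, idx):
    if prev.dtype == torch.float8_e4m3fn:
        # Cache already lives in fp8: quantize only `cur`, run index_copy
        # entirely in fp8 (backend permitting), and dequantize the result once.
        prev.index_copy_(dim, idx, quant(cur))
        return dequant(prev)
    # bf16 fallback (the `else` branch the TODO plans to remove): round-trip
    # `cur` through fp8 so its values match what the fp8 path would store,
    # then run index_copy in bf16 against the bf16 cache.
    prev.index_copy_(dim, idx, dequant(quant(cur)))
    return prev

prev = torch.zeros(4, 8, dtype=torch.bfloat16)  # bf16 cache -> fallback path
cur = torch.randn(1, 8, dtype=torch.bfloat16)
out = update_qdq_sketch(prev, cur, dim=0, idx=torch.tensor([2]))
print(out.dtype, out[2])
```

The key design point the docstring makes is visible here: the fp8 path pays one quantize and one dequantize per update, while the fallback must round-trip `cur` through fp8 anyway (to keep values consistent) yet still runs the copy in bf16.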