
Commit eeb985b

mengniwang95 authored and linoybu committed

Update PatchedVLLMKVCache for deepseek performance (#194)
Co-authored-by: Linoy Buchnik <[email protected]>
1 parent db43fa8 commit eeb985b

File tree

1 file changed: +5 −1 lines changed


neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -1071,7 +1071,11 @@ def forward_measure(self, input, cache, *args, **kwargs):
         return output_cache

     def fetch_from_cache(self, cache, blocks, permutations=None):
-        quant_cache = self.quant_input(cache)
+        # TODO: Remove this workaround in next release [SW-221595]
+        if cache.dtype != self.lp_dtype:
+            quant_cache = self.quant_input(cache)
+        else:
+            quant_cache = cache
         if permutations:
             output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks, permutations)
             for i in range(len(output_cache)):
```
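The change above guards the quantization call on the cache's dtype: if the KV cache is already stored in the low-precision dtype, re-quantizing it is redundant work on the hot path. A minimal sketch of that guard pattern, assuming a simplified stand-in class (`FakeQuantKVCache`, its `quant_input` method, and the use of `torch.float16` as the low-precision dtype are all illustrative, not the actual `neural_compressor` API):

```python
import torch

class FakeQuantKVCache:
    """Illustrative stand-in for a quantized KV-cache wrapper."""

    def __init__(self, lp_dtype=torch.float16):
        # Low-precision dtype the cache is expected to be stored in.
        self.lp_dtype = lp_dtype

    def quant_input(self, cache):
        # Stand-in for the real quantization op; here just a dtype cast.
        return cache.to(self.lp_dtype)

    def fetch_from_cache(self, cache):
        # Guard pattern from the commit: only quantize when the cache
        # is not already in the low-precision dtype.
        if cache.dtype != self.lp_dtype:
            quant_cache = self.quant_input(cache)
        else:
            quant_cache = cache  # already low-precision: reuse as-is
        return quant_cache

kv = FakeQuantKVCache()
hp = torch.zeros(2, 4, dtype=torch.float32)   # high-precision cache
lp = torch.zeros(2, 4, dtype=torch.float16)   # already low-precision

assert kv.fetch_from_cache(hp).dtype == kv.lp_dtype  # gets quantized
assert kv.fetch_from_cache(lp) is lp                 # returned untouched
```

The second assertion shows why this helps performance: for a cache already in `lp_dtype`, no copy or quantization kernel is launched at all, which is the redundancy the workaround removes for the deepseek path.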
