
Commit f94ef0c

[SW-227433] Revert "Update PatchedVLLMKVCache for deepseek performance (#194)" (#231)

Remove the workaround.

1 parent ac21933 · commit f94ef0c

File tree

1 file changed: +1 −5 lines

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -965,11 +965,7 @@ def forward_measure(self, input, cache, *args, **kwargs):
965965
return output_cache
966966

967967
def fetch_from_cache(self, cache, blocks):
968-
# TODO: Remove this workaround in next release [SW-221595]
969-
if cache.dtype != self.lp_dtype:
970-
quant_cache = self.quant_input(cache)
971-
else:
972-
quant_cache = cache
968+
quant_cache = self.quant_input(cache)
973969
output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
974970
return self.dequant_output(output_cache)
975971
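The reverted hunk makes `fetch_from_cache` quantize the incoming cache unconditionally, instead of skipping quantization when the cache is already in the low-precision dtype. A minimal, hypothetical sketch of that control flow, with toy stand-ins for the real `quant_input`, `dequant_output`, and `orig_mod.fetch_from_cache` (those internals are assumptions modeled only on the diff, not on the actual FP8 implementation):

```python
class OrigKVCacheStub:
    """Toy stand-in for the wrapped module: a cache is a dict of blocks."""

    def fetch_from_cache(self, cache, blocks):
        return [cache[b] for b in blocks]


class PatchedKVCacheSketch:
    """Sketch of the post-revert PatchedVLLMKVCache.fetch_from_cache path."""

    def __init__(self, orig_mod, scale=0.5):
        self.orig_mod = orig_mod
        self.scale = scale  # toy quantization scale, not the real FP8 scale

    def quant_input(self, cache):
        # Toy "quantization": scale every cached value down.
        return {k: v * self.scale for k, v in cache.items()}

    def dequant_output(self, out):
        # Inverse of the toy quantization above.
        return [v / self.scale for v in out]

    def fetch_from_cache(self, cache, blocks):
        # Post-revert behavior: always quantize the incoming cache;
        # the dtype check from workaround [SW-221595] is gone.
        quant_cache = self.quant_input(cache)
        output_cache = self.orig_mod.fetch_from_cache(quant_cache, blocks)
        return self.dequant_output(output_cache)


cache = {0: 2.0, 1: 4.0}
mod = PatchedKVCacheSketch(OrigKVCacheStub())
print(mod.fetch_from_cache(cache, [1, 0]))  # quantize → fetch → dequantize
```

The quantize/dequantize pair round-trips the values, so the fetch returns the original magnitudes in the requested block order.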
