Skip to content

Commit 8bd6ac8

Browse files
authored
Fixing vllm runs for dynamic quantization (#210)
1 parent d94bfc1 commit 8bd6ac8

File tree

1 file changed

+1
-5
lines changed

1 file changed

+1
-5
lines changed

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -348,11 +348,7 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
348348
kwargs["func_names"] = ("resolve_input", )
349349
super().__init__(mod, parent, mod_extra_config, *args, **kwargs)
350350
# TODO [SW-224403]: Enable dynamic quantization in row parallel allreduce
351-
allreduce_quantization_enable = (
352-
False
353-
if self.is_dynamic_quantization
354-
else get_hqt_config(mod).cfg["row_parallel_linear_allreduce_quantization"]
355-
)
351+
allreduce_quantization_enable = get_hqt_config(mod).cfg["row_parallel_linear_allreduce_quantization"]
356352
if self.quantization_mode in (QuantMode.MEASURE, QuantMode.SHAPE):
357353
self.forward = self.forward_measure_reduce if self.reduce_results and self.tp_size > 1 else self.forward_measure_no_reduce
358354

0 commit comments

Comments (0)