Commit 83a74cc
[SW-230951] Save measurements according to samples counter (#251)

* Added a post-forward hook to dump measurements according to a samples counter
* Added support for the samples counter in the config
* Removed a function in RowParallelLinear as it is removed from the vllm upstream code
* Currently only the blocking method is operational; async methods will be completed in a future commit
* Fixed CR comments
* Removed unused files
* Added a resolve_input method; it can't be defined in vllm due to upstream considerations, so it is copied here
* Fixed logging according to CR
* Fixed resolve_input and moved the hook function
1 parent ffd26cb commit 83a74cc

File tree

11 files changed: +281 -97 lines changed

neural_compressor/torch/algorithms/fp8_quant/_core/common.py

Lines changed: 3 additions & 1 deletion

@@ -74,7 +74,7 @@ def load_npz(fname):
     return d["arr_0"].item()


-def save_file(model, d, source_format, fname, mode):
+def save_file(model, d, source_format, fname, mode, num_samples=0):
     from .._quant_common.quant_config import get_hqt_config
     config = get_hqt_config(model)
     logger.debug("Saving %s file: %s", mode, fname)
@@ -87,6 +87,8 @@ def save_file(model, d, source_format, fname, mode):
         "Mode": mode,
         "Nodes": dc,
     }
+    if num_samples > 0:
+        df = {"NumSamples": num_samples, **df}
     try:
         file_functions[ext]['save'](df, fname)
     except:
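
The new num_samples argument only prepends a "NumSamples" field to the header dictionary that save_file already serializes. A minimal standalone sketch of that merge (plain Python; the sample values are illustrative, not from the commit):

    df = {"Mode": "DynamicRange", "Nodes": {"layer0": {}}}
    num_samples = 128

    if num_samples > 0:
        # dicts preserve insertion order, so "NumSamples" becomes the first header key
        df = {"NumSamples": num_samples, **df}

    print(list(df))  # ['NumSamples', 'Mode', 'Nodes']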

neural_compressor/torch/algorithms/fp8_quant/_core/measure.py

Lines changed: 9 additions & 93 deletions

@@ -12,15 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import json
-import os
-
 import numpy as np
 import torch

 from abc import abstractmethod

 from .._quant_common.quant_config import MeasureExclude, QuantMode, get_hqt_config, set_hqt_config
+from .save_measure import gmod_list
 from .scale_methods.scale_method_config import ScaleMethodString
 from ..utils.logger import logger
 from .common import load_file, save_file, ShapeList
@@ -33,10 +31,9 @@
     IMOD_DICT,
 )
 from neural_compressor.torch.algorithms.fp8_quant._core.common import dequant_original_fp8_weight_if_needed
-cur_accelerator = auto_detect_accelerator()


-gmod_list = []
+cur_accelerator = auto_detect_accelerator()


 def patch_module_measure(mod, mconfig, mod_dict):
@@ -115,6 +112,12 @@ def prepare_model(model, mod_list=None):
     generate_model_info(model)
     register_patched_measure_modules(model, mod_list, observer_class, d_shapes)

+def setup_calibration_counter(model, config):
+    # used for automatically dumping measurements
+    calibration_sample_interval = int(config["calibration_sample_interval"])
+    if calibration_sample_interval > 0:
+        from .save_measure import add_calibration_samples_counter
+        add_calibration_samples_counter(model, calibration_sample_interval)

 def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=None):
     """Replace the submodules of the model that appear in mod_list with a patched submodule that uses the given observer_class
@@ -129,6 +132,7 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=None):
     """
     top_level_config = get_hqt_config(model)
     config = top_level_config.cfg
+    setup_calibration_counter(model, config)
     skip_outputs_measurements = config["measure_exclude"] & (MeasureExclude.OUTPUT | MeasureExclude.ALL)
     patched_types = set()
     non_patched_types = set()
@@ -187,94 +191,6 @@ def register_patched_measure_modules(model, mod_list, observer_class, d_shapes=None):
     cur_accelerator.synchronize()


-def is_measure_done(mod_extra_config):
-    # check if measurements were collected by observer
-    for obs in ([] if mod_extra_config.inputs is None else mod_extra_config.inputs) + (
-        [] if mod_extra_config.outputs is None else mod_extra_config.outputs
-    ):
-        if obs.is_used():
-            return True
-    return False
-
-
-def get_mod_extra_config_dict(model):
-    mcd = {}
-    for name, mod in model.named_modules():
-        if hasattr(mod, "_mod_extra_config") and mod._mod_extra_config:
-            if is_measure_done(mod._mod_extra_config):
-                name = name.replace("_orig_mod.", "")  # remove _orig_mod part added by dynamo mechanism
-                mcd[name] = mod._mod_extra_config
-            else:
-                logger.debug(
-                    "Layer '%s' has no measurements therefore it can't be quantized during quantization.",
-                    name,
-                )
-    return mcd
-
-
-def measure_control_to_state_dict(mcd):
-    sd = {}
-    sdl = {}
-    for mname in mcd:
-        sd[mname] = dict()
-        sdl[mname] = dict()
-        sd[mname]["inputs"] = [
-            mcd[mname].inputs[i].state.detach().cpu().float().numpy()
-            for i in range(len(mcd[mname].inputs))
-            if mcd[mname].inputs[i].state is not None
-        ]
-        sdl[mname]["inputs"] = [
-            mcd[mname].inputs[i].state.detach().cpu().float().numpy().tolist()
-            for i in range(len(mcd[mname].inputs))
-            if mcd[mname].inputs[i].state is not None
-        ]
-        if mcd[mname].outputs:
-            sd[mname]["outputs"] = [
-                mcd[mname].outputs[i].state.detach().cpu().float().numpy()
-                for i in range(len(mcd[mname].outputs))
-                if mcd[mname].outputs[i].state is not None
-            ]
-            sdl[mname]["outputs"] = [
-                mcd[mname].outputs[i].state.detach().cpu().float().numpy().tolist()
-                for i in range(len(mcd[mname].outputs))
-                if mcd[mname].outputs[i].state is not None
-            ]
-        if len(mcd[mname].params) > 0:
-            sd[mname]["params"] = dict()
-            sdl[mname]["params"] = dict()
-            for param_name in mcd[mname].params:
-                if mcd[mname].params[param_name].state is not None:
-                    sd[mname]["params"][param_name] = mcd[mname].params[param_name].state.detach().cpu().float().numpy()
-                    sdl[mname]["params"][param_name] = (
-                        mcd[mname].params[param_name].state.detach().cpu().float().numpy().tolist()
-                    )
-    return sd, sdl
-
-
-def save_measurements(model, fname=None):
-    config = get_hqt_config(model).cfg
-    if config["mode"] in [QuantMode.MEASURE, QuantMode.SHAPE]:
-        if fname is None:
-            if ("measure_file" in config) and (config["measure_file"] is not None):
-                fname_base = config["measure_file"]
-                measure_type = "DynamicRange"
-            elif ("shape_file" in config) and (config["shape_file"] is not None) and (config["observer"] == "shape"):
-                fname_base = config["shape_file"]
-                measure_type = "Shape"
-            fname_np = fname_base + ".npz"
-            fname_list = fname_base + ".json"
-        else:
-            logger.warning("'fname' is not None - Measurements/Shapes will not be saved")
-            return
-        mcd = get_mod_extra_config_dict(model)
-        sd, sdl = measure_control_to_state_dict(mcd)
-
-        logger.info("Dumping measurements")
-        save_file(model, sd, np.ndarray, fname_np, measure_type)
-        save_file(model, sdl, list, fname_list, measure_type)
-        save_json(gmod_list, fname_base + "_mod_list.json")
-
-
 def load_measurements(model, fname):
     config = get_hqt_config(model).cfg
     source_fname = fname if fname is not None else config["measure_file"]

Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+# Copyright (c) 2025 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from .save_files import save_measurements, gmod_list
+from .hook_logic import add_calibration_samples_counter

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+import asyncio
+import os
+from threading import Thread
+
+from neural_compressor.torch.algorithms.fp8_quant.utils.logger import logger
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import get_hqt_config
+from .save_files import (
+    create_files_names, measure_control_to_state_dict, save_measurements_files, save_measurements,
+    get_mod_extra_config_dict, gmod_list)
+
+
+def dump_direct_call(model):
+    save_measurements(model)
+
+def dump_threading(model):
+    t = Thread(target=save_measurements, args=(model,), daemon=True)
+    t.start()
+
+def dump_async_call(model):
+    asyncio.run(dump_async_call_inner(model))
+
+async def dump_async_call_inner(model):
+    mcd = get_mod_extra_config_dict(model)
+    await save_measurements_async_wrapper(model, mcd)
+
+async def save_measurements_async_wrapper(model, mcd):
+    config = get_hqt_config(model).cfg
+    fname_base, fname_np, fname_list, measure_type = create_files_names(config)
+    sd, sdl = measure_control_to_state_dict(mcd)
+    save_measurements_files(model, sd, sdl, gmod_list, fname_np, fname_list, fname_base, measure_type)
+
+
+def dump_shelv(model):
+    pass
+
+_measurement_dump_method = os.getenv("MEASUREMENT_DUMP_METHOD", "1")
+_measurement_dump_method_dict = {
+    "1": dump_direct_call,
+    # below methods shouldn't be currently used as they are not fully completed
+    "2": dump_threading,
+    "3": dump_async_call,
+    "5": dump_shelv
+}
+
+
+def _increment_calibration_samples_counter(model, *args):  # post hook function
+    model.calibration_samples_counter += 1
+    if model.calibration_samples_counter % model.calibration_sample_interval == 0:
+        logger.debug("Reached sampling interval limit: %d, total samples: %d, dumping measurements.",
+                     model.calibration_sample_interval, model.calibration_samples_counter)
+        _measurement_dump_method_dict[_measurement_dump_method](model)
+        logger.debug("finished dumping measurements.")
+
+def add_calibration_samples_counter(model_to_calibrate, calibration_sample_interval):
+    """
+    Adds a forward post-hook to the model that counts the number of calibration samples processed.
+    When the maximum number of samples is reached, it saves the measurements.
+    """
+    model_to_calibrate.calibration_samples_counter = 0
+    model_to_calibrate.calibration_sample_interval = calibration_sample_interval
+    model_to_calibrate.register_forward_hook(_increment_calibration_samples_counter)
+    logger.info("Calibration samples interval added to the model - %d.", calibration_sample_interval)

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+import sys
+
+from ..measure import save_measurements
+
+if __name__ == "__main__":
+    model = sys.argv[0]
+    save_measurements(model)

Lines changed: 102 additions & 0 deletions

@@ -0,0 +1,102 @@
+from neural_compressor.torch.algorithms.fp8_quant.utils.logger import logger
+from neural_compressor.torch.algorithms.fp8_quant._quant_common.quant_config import get_hqt_config, QuantMode
+from neural_compressor.torch.algorithms.fp8_quant._core.common import save_file, save_json
+
+
+def is_measure_done(mod_extra_config):
+    # check if measurements were collected by observer
+    for obs in ([] if mod_extra_config.inputs is None else mod_extra_config.inputs) + (
+        [] if mod_extra_config.outputs is None else mod_extra_config.outputs
+    ):
+        if obs.is_used():
+            return True
+    return False
+
+def get_mod_extra_config_dict(model):
+    mcd = {}
+    for name, mod in model.named_modules():
+        if hasattr(mod, "_mod_extra_config") and mod._mod_extra_config:
+            if is_measure_done(mod._mod_extra_config):
+                name = name.replace("_orig_mod.", "")  # remove _orig_mod part added by dynamo mechanism
+                mcd[name] = mod._mod_extra_config
+            else:
+                logger.debug(
+                    "Layer '%s' has no measurements therefore it can't be quantized during quantization.",
+                    name,
+                )
+    return mcd
+
+def measure_control_to_state_dict(mcd):
+    sd = {}
+    sdl = {}
+    for mname in mcd:
+        sd[mname] = dict()
+        sdl[mname] = dict()
+        sd[mname]["inputs"] = [
+            mcd[mname].inputs[i].state.detach().cpu().float().numpy()
+            for i in range(len(mcd[mname].inputs))
+            if mcd[mname].inputs[i].state is not None
+        ]
+        sdl[mname]["inputs"] = [
+            mcd[mname].inputs[i].state.detach().cpu().float().numpy().tolist()
+            for i in range(len(mcd[mname].inputs))
+            if mcd[mname].inputs[i].state is not None
+        ]
+        if mcd[mname].outputs:
+            sd[mname]["outputs"] = [
+                mcd[mname].outputs[i].state.detach().cpu().float().numpy()
+                for i in range(len(mcd[mname].outputs))
+                if mcd[mname].outputs[i].state is not None
+            ]
+            sdl[mname]["outputs"] = [
+                mcd[mname].outputs[i].state.detach().cpu().float().numpy().tolist()
+                for i in range(len(mcd[mname].outputs))
+                if mcd[mname].outputs[i].state is not None
+            ]
+        if len(mcd[mname].params) > 0:
+            sd[mname]["params"] = dict()
+            sdl[mname]["params"] = dict()
+            for param_name in mcd[mname].params:
+                if mcd[mname].params[param_name].state is not None:
+                    sd[mname]["params"][param_name] = mcd[mname].params[param_name].state.detach().cpu().float().numpy()
+                    sdl[mname]["params"][param_name] = (
+                        mcd[mname].params[param_name].state.detach().cpu().float().numpy().tolist()
+                    )
+    return sd, sdl
+
+def create_files_names(config, fname=None):
+    if fname is None:
+        if ("measure_file" in config) and (config["measure_file"] is not None):
+            fname_base = config["measure_file"]
+            measure_type = "DynamicRange"
+        elif ("shape_file" in config) and (config["shape_file"] is not None) and (config["observer"] == "shape"):
+            fname_base = config["shape_file"]
+            measure_type = "Shape"
+        fname_np = fname_base + ".npz"
+        fname_list = fname_base + ".json"
+        return fname_base, fname_np, fname_list, measure_type
+    else:
+        logger.warning("'fname' is not None - Measurements/Shapes will not be saved")
+        return
+
+def save_measurements_files(model, state_dict, state_list, gmod_list, fname_np, fname_list, fname_base, measure_type,
+                            num_samples=0):
+    import numpy as np
+    logger.info("Dumping measurements")
+    save_file(model, state_dict, np.ndarray, fname_np, measure_type, num_samples)
+    save_file(model, state_list, list, fname_list, measure_type, num_samples)
+    save_json(gmod_list, fname_base + "_mod_list.json")
+    return
+
+
+gmod_list = []  # global list extended with patched modules in measure.prepare_model
+
+
+def save_measurements(model, fname=None):
+    config = get_hqt_config(model).cfg
+    if config["mode"] in [QuantMode.MEASURE, QuantMode.SHAPE]:
+        fname_base, fname_np, fname_list, measure_type = create_files_names(config, fname)
+        mcd = get_mod_extra_config_dict(model)
+        sd, sdl = measure_control_to_state_dict(mcd)
+        num_samples = model.calibration_samples_counter if hasattr(model, "calibration_samples_counter") else 0
+        save_measurements_files(model, sd, sdl, gmod_list, fname_np, fname_list, fname_base, measure_type, num_samples)
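
Since save_measurements now forwards the counter into save_file, a measurement file dumped mid-calibration carries the sample count in its header. A hedged sketch of reading it back, assuming the .json output and a measure_file base name of "measurements" (header keys beyond "Mode" and "Nodes" depend on the existing save_file implementation):

    import json

    # "measurements.json" stands for config["measure_file"] + ".json"
    with open("measurements.json") as f:
        header = json.load(f)

    print(header.get("NumSamples"))  # present only when the counter was active, e.g. 128
    print(header["Mode"])            # e.g. "DynamicRange"
    print(list(header["Nodes"]))     # names of measured modules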

neural_compressor/torch/algorithms/fp8_quant/_quant_common/helper_modules.py

Lines changed: 13 additions & 1 deletion

@@ -347,7 +347,6 @@ def post_all_reduce(self, input):

 class PatchedRowParallelLinear(PatchedLinearBase):
     def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
-        kwargs["func_names"] = ("resolve_input", )
         super().__init__(mod, parent, mod_extra_config, *args, **kwargs)
         from .._core.vllm_functions import get_vllm_row_parallel_collective_func
         self.row_parallel_collective_func = get_vllm_row_parallel_collective_func()
@@ -377,6 +376,19 @@ def __init__(self, mod, parent, mod_extra_config, *args, **kwargs):
         from torch import distributed as dist
         self.world_size = dist.get_world_size()

+    def resolve_input(self, input_):
+        """
+        This code is copied from the vllm RowParallelLinear forward method.
+        """
+        if self.input_is_parallel:
+            input_parallel = input_
+        else:
+            tp_rank = get_tensor_model_parallel_rank()
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size)
+            input_parallel = splitted_input[tp_rank].contiguous()
+        return input_parallel
+
     def forward_qdq(self, input):
         # TODO: [SW-208441] Support all_reduce_fp8 in forward_qdq in PatchedRowParallelLinear
         resolved_input = self.resolve_input(input)
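
The copied resolve_input mirrors vllm's RowParallelLinear input handling: when the input is not already sharded, each tensor-parallel rank takes its own slice along the last dimension. A standalone sketch of that slicing in plain torch (rank and world size are illustrative; for evenly divisible sizes, vllm's split_tensor_along_last_dim behaves like torch.chunk):

    import torch

    def resolve_input_sketch(input_, input_is_parallel, tp_rank, tp_size):
        if input_is_parallel:
            return input_  # already sharded, use as-is
        # split along the last dim and keep this rank's shard
        shards = torch.chunk(input_, chunks=tp_size, dim=-1)
        return shards[tp_rank].contiguous()

    x = torch.randn(2, 8)
    print(resolve_input_sketch(x, False, tp_rank=1, tp_size=4).shape)  # torch.Size([2, 2])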

neural_compressor/torch/algorithms/fp8_quant/_quant_common/quant_config.py

Lines changed: 2 additions & 1 deletion

@@ -184,7 +184,8 @@ def parse(custom_config: Mapping[str, str]) -> Fp8cfg:
         "scale_format": ScaleFormat.SCALAR,
         "measure_on_hpu": True,  # Determines whether to measure model on hpu device.
         "row_parallel_linear_allreduce_quantization": False,  # Turn on/off fp8 allreduce optimization detailed in SW-207602
-        "dynamic_quantization": False  # Turn on/off fp8 dynamic quantization
+        "dynamic_quantization": False,  # Turn on/off fp8 dynamic quantization
+        "calibration_sample_interval": 0  # number of samples to process before dumping measurements, 0 means no automatic dumping
     }
     # go over all user-defined keys from json, handle various cases
     for keys in custom_config:
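
With the default in place, periodic dumping is opt-in: any positive calibration_sample_interval in the user's quantization config enables it. A hedged sketch of such a config as a Python dict (measure_file and mode are existing keys referenced elsewhere in this commit; the values shown are illustrative):

    # Hypothetical measurement-mode config enabling a dump every 100 samples.
    custom_config = {
        "mode": "MEASURE",                      # measurement mode, value illustrative
        "measure_file": "inc_output/measure",   # base name for the .npz/.json dumps
        "calibration_sample_interval": 100,     # new key; 0 (the default) disables auto-dumping
    }

The dump strategy is selected by the MEASUREMENT_DUMP_METHOD environment variable in the new hook module; per the commit message, only the blocking method ("1", direct call) is currently operational.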
