
Commit 283de79

changwangss and xinhe3 authored
[SW-219751]improve vllm compatible save function (#217)
* improve vllm compatible save to avoid OOM

---------

Signed-off-by: changwangss <[email protected]>
Signed-off-by: Xin He <[email protected]>
Co-authored-by: Xin He <[email protected]>
1 parent a7eaaa9 commit 283de79

File tree: 2 files changed (+76 / -84 lines)


neural_compressor/torch/algorithms/fp8_quant/save_load.py

Lines changed: 73 additions & 83 deletions
@@ -60,7 +60,6 @@
 
 
 ##################################### save ##################################
-
 def remove_rank_suffix(name, local_rank, world_size):
     """Remove rank suffix from key name."""
     return name.removesuffix(f"_{local_rank}_{world_size}")
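For orientation: the multi-device save path names per-rank artifacts with a `_{local_rank}_{world_size}` suffix, and remove_rank_suffix strips it via str.removesuffix (Python 3.9+). A minimal sketch of that convention; the add_rank_suffix defined here is an assumed inverse written only for illustration, not copied from this file:

# Illustration of the rank-suffix naming convention; add_rank_suffix below is a
# hypothetical inverse of remove_rank_suffix, defined only for this example.
def add_rank_suffix(name, local_rank, world_size):
    return f"{name}_{local_rank}_{world_size}"

def remove_rank_suffix(name, local_rank, world_size):
    return name.removesuffix(f"_{local_rank}_{world_size}")

key = add_rank_suffix("model.layers.0.mlp.down_proj.weight", 1, 2)
assert key == "model.layers.0.mlp.down_proj.weight_1_2"
assert remove_rank_suffix(key, 1, 2) == "model.layers.0.mlp.down_proj.weight"
# A key that belongs to another rank is left untouched:
assert remove_rank_suffix("model.layers.0.mlp.down_proj.weight_0_2", 1, 2) == "model.layers.0.mlp.down_proj.weight_0_2"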
@@ -291,54 +290,6 @@ def convert_config_to_vllm_compatible(config):
     }
     return quantization_config
 
-def gather_and_save_model(tp_model, original_model, world_size, max_shard_size, output_dir):
-    """
-    Merge the weights of a DeepSpeed Tensor Parallel model into the original model
-    and save them as sharded safetensors files.
-
-    Args:
-        tp_model: DeepSpeed tensor parallel model.
-        original_model: The original unsharded model.
-        world_size: The model parallelism size.
-        max_shard_size: The maximum size of each shard in bytes.
-        output_dir: The directory to save the files.
-    """
-    import deepspeed
-    merged_state_dict = {}
-
-    for name, tp_param in tp_model.named_parameters():
-        if "scale" not in name:
-            param = original_model.state_dict()[name].t()
-            if len(param.shape) != 0 and tp_param.shape != param.shape:
-                # Perform all-gather to merge tensor parallel parameters
-                tensor_list = [torch.zeros_like(tp_param) for _ in range(world_size)]
-                deepspeed.comm.all_gather(tensor_list, tp_param)
-                if local_rank == 0:
-                    if tp_param.shape[0] != param.shape[0]:
-                        merged_param = torch.cat(tensor_list, dim=0)
-                    elif tp_param.shape[1] != param.shape[1]:
-                        merged_param = torch.cat(tensor_list, dim=1)
-                    logger.info(f"[Rank {local_rank}] Merging parameter: {name}")
-            else:
-                if local_rank == 0:
-                    # No merging needed if shapes match
-                    merged_param = tp_param
-                    logger.info(f"[Rank {local_rank}] No merging needed for parameter: {name}")
-        else:
-            deepspeed.comm.all_reduce(tp_param, op=deepspeed.comm.ReduceOp.MAX)
-            if local_rank == 0:
-                merged_param = tp_param
-                logger.info(f"[Rank {local_rank}] No merging needed for parameter: {name}")
-        # Update the merged state dict
-        if local_rank == 0:
-            merged_state_dict[name] = merged_param
-
-    # Save the merged state dict as sharded safetensors files (only on rank 0)
-    if local_rank == 0:
-        logger.info(f"[Rank {local_rank}] Saving model shards to {output_dir}...")
-        merged_state_dict = convert_weight_to_vllm_compatible(state_dict=merged_state_dict)
-        save_state_dict_sharded_safetensors(merged_state_dict, output_dir, f"{MAX_FILE_SIZE}GB")
-
 
 def check_config_for_vllm_compatible(config):
     """
@@ -362,37 +313,30 @@ def save_for_multi_devices(model, checkpoint_dir="saved_results", format="huggin
         format (str, optional): defaults to 'huggingface'.
     """
     from safetensors.torch import save_file as safe_save_file
-    if format == SaveLoadFormat.VLLM:
-        import transformers
-        from accelerate import init_empty_weights
-        with init_empty_weights(include_buffers=False):
-            reference_model = transformers.AutoModelForCausalLM.from_config(model.config)
-        gather_and_save_model(
-            model, reference_model, world_size=world_size, max_shard_size=f"{MAX_FILE_SIZE}GB", output_dir=checkpoint_dir
+    folder_prefix = os.path.join(options.workspace, checkpoint_dir)
+    save_rank_model(model, folder_prefix=folder_prefix, **kwargs)
+    # Ensure all ranks have saved their model before proceeding
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+    rank_directory = add_rank_suffix(folder_prefix, 0, world_size)
+    files_list = find_safetensors_files(rank_directory)
+    # use rank:0 process to gather checkpoint files
+    if local_rank == 0:
+        tp_mod_list = find_tp_mod_list(model)
+        os.makedirs(checkpoint_dir, exist_ok=True)
+        # get the safetensors file list from one folder
+        # based on the safetensors file name to collect tensors from shard folders
+        for file_name in files_list:
+            gathered_state_dict = gather_state_dict(
+                folder_prefix=folder_prefix, file_name=file_name, tp_mod_list=tp_mod_list
             )
-    else:
-        folder_prefix = os.path.join(options.workspace, checkpoint_dir)
-        save_rank_model(model, folder_prefix=folder_prefix, **kwargs)
-        # Ensure all ranks have saved their model before proceeding
-        if torch.distributed.is_initialized():
-            torch.distributed.barrier()
-        rank_directory = add_rank_suffix(folder_prefix, 0, world_size)
-        files_list = find_safetensors_files(rank_directory)
-        # use rank:0 process to gather checkpoint files
-        if local_rank == 0:
-            tp_mod_list = find_tp_mod_list(model)
-            os.makedirs(checkpoint_dir, exist_ok=True)
-            # get the safetensors file list from one folder
-            # based on the safetensors file name to collect tensors from shard folders
-            for file_name in files_list:
-                gathered_state_dict = gather_state_dict(
-                    folder_prefix=folder_prefix, file_name=file_name, tp_mod_list=tp_mod_list
-                )
-                safe_save_file(gathered_state_dict, os.path.join(checkpoint_dir, file_name), metadata={"format": "pt"})
-                clean_rank_files(folder_prefix=folder_prefix, file_name=file_name)
-        if torch.distributed.is_initialized():
-            torch.distributed.barrier()
-        clean_rank_files(folder_prefix=folder_prefix)
+            if format == SaveLoadFormat.VLLM:
+                gathered_state_dict = update_to_vllm_compatible(model, gathered_state_dict)
+            safe_save_file(gathered_state_dict, os.path.join(checkpoint_dir, file_name), metadata={"format": "pt"})
+            clean_rank_files(folder_prefix=folder_prefix, file_name=file_name)
+    if torch.distributed.is_initialized():
+        torch.distributed.barrier()
+    clean_rank_files(folder_prefix=folder_prefix)
 
 
 def save_for_single_device(model, checkpoint_dir="saved_results", format="huggingface", **kwargs):
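The previous vLLM branch all-gathered every tensor-parallel shard into one in-memory state dict before saving, which is presumably the OOM the commit title refers to. The new flow always takes the per-rank route: every rank writes its own rank-suffixed folder, a barrier guarantees all shards are on disk, and only rank 0 merges them file by file, converting each merged file to the vLLM layout when requested. A minimal sketch of that coordination pattern, assuming torch.distributed is initialized and using hypothetical save/gather helpers rather than the real save_rank_model / gather_state_dict:

import os
import torch.distributed as dist

def save_then_gather(model, checkpoint_dir, save_rank_shard, gather_file):
    # save_rank_shard and gather_file are hypothetical stand-ins for the
    # library helpers (save_rank_model / gather_state_dict in save_load.py).
    rank = dist.get_rank() if dist.is_initialized() else 0
    world_size = dist.get_world_size() if dist.is_initialized() else 1

    shard_dir = f"{checkpoint_dir}_{rank}_{world_size}"  # rank-suffixed scratch folder
    save_rank_shard(model, shard_dir)                     # every rank writes only its own shard

    if dist.is_initialized():
        dist.barrier()                                    # wait until every rank has finished writing

    if rank == 0:
        os.makedirs(checkpoint_dir, exist_ok=True)
        reference_dir = f"{checkpoint_dir}_0_{world_size}"
        for file_name in sorted(os.listdir(reference_dir)):
            merged = gather_file(checkpoint_dir, file_name, world_size)  # one file at a time
            # ... write `merged` to checkpoint_dir, then delete the per-rank copies of this file ...

    if dist.is_initialized():
        dist.barrier()                                    # keep ranks in sync before final cleanup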
@@ -558,6 +502,9 @@ def shard_state_dict(state_dict, return_all_rank=False, src_world_size=None):
     def get_rank_state_dict(state_dict, local_rank, world_size):
         rank_state_dict = {}
         for k, v in state_dict.items():
+            if k.endswith(f"_{world_size}") and not k.endswith(f"_{local_rank}_{world_size}"):
+                # only collect current rank state_dict and common state_dict (e.g., embedding)
+                continue
             new_k = remove_rank_suffix(k, local_rank, world_size)
             rank_state_dict[new_k] = v
         return rank_state_dict
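The added guard means each rank's view is built from only its own `_{local_rank}_{world_size}` entries plus un-suffixed common tensors (e.g. embeddings), instead of carrying every other rank's shards. A small self-contained illustration of that filtering, with made-up key names:

def rank_view(state_dict, local_rank, world_size):
    # Mirrors the filtering logic added to get_rank_state_dict, for illustration only.
    out = {}
    for k, v in state_dict.items():
        if k.endswith(f"_{world_size}") and not k.endswith(f"_{local_rank}_{world_size}"):
            continue  # shard belonging to another rank
        out[k.removesuffix(f"_{local_rank}_{world_size}")] = v
    return out

full = {
    "lm_head.weight_0_2": "rank0 shard",
    "lm_head.weight_1_2": "rank1 shard",
    "model.embed_tokens.weight": "common",
}
assert rank_view(full, local_rank=1, world_size=2) == {
    "lm_head.weight": "rank1 shard",
    "model.embed_tokens.weight": "common",
}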
@@ -621,7 +568,11 @@ def get_inc_fp8config(model, from_neuralmagic=False, from_neuralmagic_with_kv=Fa
         allowlist = {"types": ["Linear", "LinearLayer", "LinearAllreduce"], "names": []}
         qconfig = FP8Config(mode="LOAD", allowlist=allowlist, blocklist=blocklist, scale_format="CONST")
     else:
-        qconfig = FP8Config.from_dict(model.config.quantization_config)
+        if hasattr(model, "qconfig") and model.qconfig is not None:
+            configs_mapping = model.qconfig
+            qconfig = configs_mapping[next(iter(configs_mapping))]
+        else:
+            qconfig = FP8Config.from_dict(model.config.quantization_config)
     return qconfig
 
 
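The fallback order is new: a qconfig mapping still attached to the quantized model is preferred, and only otherwise is the config rebuilt from model.config.quantization_config as before. `next(iter(configs_mapping))` simply picks the mapping's first key; a tiny illustration with a made-up mapping (the real keys and config objects in INC differ):

# Hypothetical stand-in for the configs_mapping carried on a quantized model.
configs_mapping = {
    ("model.lm_head", "Linear"): {"mode": "QUANTIZE", "scale_format": "CONST"},
    ("model.layers.0.mlp.down_proj", "Linear"): {"mode": "QUANTIZE", "scale_format": "CONST"},
}
first_key = next(iter(configs_mapping))    # dicts preserve insertion order
qconfig = configs_mapping[first_key]
print(first_key, qconfig["scale_format"])  # ('model.lm_head', 'Linear') CONST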
@@ -678,7 +629,7 @@ def load(model_name_or_path, format="huggingface", device="hpu", **kwargs):
             raise EnvironmentError(
                 f"Original world_size: {src_world_size} must be divisible by target world_size: {world_size}."
             )
-        rank_state_dict = get_new_rank_state_dict(rank_state_dict, world_size, model=model)
+        rank_state_dict = get_new_rank_state_dict(rank_state_dict, model, world_size)
     else:
         rank_state_dict = gathered_state_dict
     model.load_state_dict(rank_state_dict, assign=True, strict=False)
@@ -816,8 +767,16 @@ def load_scale_params(model, new_scale_params):
             param.data = new_scale
 
 
-def get_new_rank_state_dict(all_rank_state_dict, world_size=world_size, local_rank=local_rank, model=None):
-    """Get new rank state_dict for world_size."""
+def get_new_rank_state_dict(all_rank_state_dict, model, world_size=world_size, local_rank=local_rank):
+    """Get new rank state_dict for world_size.
+
+    Args:
+        all_rank_state_dict (dict): {0: state_dict, 1: state_dict, ...} for all ranks.
+        model (torch.nn.Module): A quantized model with empty weights in load mode.
+        world_size (int, optional): Target world size. Defaults to world_size.
+        local_rank (int, optional): Current local rank. Defaults to local_rank.
+    """
+
     def dq_q_weight(weight, previous_scale, target_scale):
         """dequantize and quantize weight with different scales."""
         cast_to_op = get_quantized_func_wrapper(OP_TYPE.CAST_TO_FP8, ScaleFormat.CONST)
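For context, the nested dq_q_weight helper (only partially visible in this hunk) dequantizes an FP8 weight with the scale it was saved under and requantizes it with the target scale, which is needed when shards saved for one world size are re-cut for another. A plain-PyTorch sketch of that round trip, assuming the usual real = quantized * scale convention; the actual code goes through the fused CAST_TO_FP8 quantized-func wrapper on HPU:

import torch

def dq_q_weight_sketch(q_weight, previous_scale, target_scale):
    # Hedged stand-in for dq_q_weight: dequantize with the old scale,
    # requantize with the new one (assumes real = quantized * scale).
    real = q_weight.to(torch.float32) * previous_scale
    return (real / target_scale).to(torch.float8_e4m3fn)

q = torch.randn(4, 4).to(torch.float8_e4m3fn)   # pretend this is a saved FP8 shard
requantized = dq_q_weight_sketch(q, previous_scale=0.5, target_scale=0.25)
print(requantized.dtype)                        # torch.float8_e4m3fn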
@@ -936,3 +895,34 @@ def update_corresponding_weight(previous_state_dict, cur_state_dict, target_weig
                 tp_dim = 0 if params_dict[k].shape[0] != new_rank_state_dict[k].shape[0] else 1
                 new_rank_state_dict[k] = torch.cat([new_rank_state_dict[k], v], dim=tp_dim)
     return new_rank_state_dict
+
+
+def update_to_vllm_compatible(model, gathered_state_dict):
+    """update gathered_state_dict to vllm compatible format.
+
+    Args:
+        model (model): the quantized model.
+        gathered_state_dict (dict): state_dict for all ranks.
+    Returns:
+        gathered_state_dict (dict): vllm compatible state_dict.
+    """
+    # for example, tp = 2, make
+    # 'model.layers.21.mlp.down_proj.weight_0_2', 'model.layers.21.mlp.down_proj.weight_1_2'
+    # --> 'model.layers.21.mlp.down_proj.weight'
+    import transformers
+    from accelerate import init_empty_weights
+    from neural_compressor.torch.algorithms.fp8_quant import prep_model
+
+    with init_empty_weights(include_buffers=False):
+        reference_model = transformers.AutoModelForCausalLM.from_config(model.config)
+    # replace modules with patched modules
+    inc_config = get_inc_fp8config(model)
+    inc_config.mode = "LOAD"
+    inc_config.save_temp_json_file()
+    prep_model(reference_model, inc_config.json_file)
+    # gather weights into 1 rank
+    rank_state_dict = shard_state_dict(gathered_state_dict, return_all_rank=True, src_world_size=world_size)
+    rank_state_dict = get_new_rank_state_dict(rank_state_dict, reference_model, world_size=1)
+    # rename param names
+    gathered_state_dict = convert_weight_to_vllm_compatible(state_dict=rank_state_dict)
+    return gathered_state_dict
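A large part of the memory saving is that the vLLM-layout reference model is built on the meta device with accelerate's init_empty_weights, so it contributes module structure, parameter names, and shapes but no real storage while serving as the template for re-assembling the gathered tensors. A minimal sketch of that trick (the model name is only an example):

import transformers
from accelerate import init_empty_weights

config = transformers.AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # example model
with init_empty_weights(include_buffers=False):
    reference_model = transformers.AutoModelForCausalLM.from_config(config)

# Parameters live on the meta device: names and shapes exist, real storage does not.
p = next(reference_model.parameters())
print(p.device, tuple(p.shape))   # meta (...)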

test/3x/torch/quantization/fp8_quant/test_save_load.py

Lines changed: 3 additions & 1 deletion
@@ -33,7 +33,7 @@ def compare_parameters_buffers(model1, model2, atol=1e-8):
     for k, v in dict1.items():
         assert k in dict2, "k not in dict2"
         assert v.dtype == dict2[k].dtype, f"dtype of {k} is differnt.\n{v.dtype}\n{dict2[k].dtype}"
-        assert torch.allclose(v, dict2[k], atol=atol), f"{k} is differnt in model1 and model2.\n" + f"{v}\n" + f"{dict2[k]}\n"
+        assert torch.allclose(v.float(), dict2[k].float(), atol=atol), f"{k} is differnt in model1 and model2.\n" + f"{v}\n" + f"{dict2[k]}\n"
 
 
 @torch.no_grad()
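Casting both sides to float32 before torch.allclose lets the same check cover low-precision checkpoints uniformly; elementwise comparison support for the FP8 dtypes is limited in PyTorch, so comparing in full precision is the safer route and keeps the fixed atol meaningful. A small illustration with bfloat16 tensors:

import torch

a = torch.tensor([1.0, 2.0, 3.0], dtype=torch.bfloat16)
b = a.clone()

# Compare in full precision; this also sidesteps dtype-support gaps for FP8 tensors.
assert torch.allclose(a.float(), b.float(), atol=1e-8)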
@@ -72,6 +72,8 @@ def test_save_vllm_compatible_model():
     generation_config.save_pretrained("saved_results_qwen")
     tokenizer = transformers.AutoTokenizer.from_pretrained(name)
     tokenizer.save_pretrained("saved_results_qwen")
+    shutil.rmtree("saved_results_qwen", ignore_errors=True)
+    shutil.rmtree("nc_workspace", ignore_errors=True)
 
 
 @pytest.mark.skip(reason="[SW-226589] Skip this test since the model was updated")
 def test_load_model_provided_by_neuralmagic():
