intel · harborn · Feb 23, 2024 · Feb 5, 2024 · Feb 5, 2024 · Feb 5, 2024
diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py
@@ -2,9 +2,8 @@
 import argparse
 
 
-def update_finetune_config(base_model):
-    conf_file = "finetune/finetune.yaml"
-    with open(conf_file) as f:
+def update_finetune_config(config_file, base_model):
+    with open(config_file) as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
         # due to compute node can't connect network
         # base models are downloaded as local files in directory ~/models/
@@ -23,18 +22,21 @@ def update_finetune_config(base_model):
         # pythia-6.9b
 
         config["General"]["base_model"] = base_model
-        # config["General"]["base_model"] = "pythia-70m"
+        config["General"]["output_dir"] = "./output"
+        config["General"]["checkpoint_dir"] = "./checkpoint"
         config["Training"]["device"] = "GPU"
         config["Training"]["resources_per_worker"]["CPU"] = 1
         config["Training"]["resources_per_worker"]["GPU"] = 1
         config["Training"]["accelerate_mode"] = "GPU_DDP"
+        config["Training"]["logging_steps"] = 1
 
-    with open(conf_file, "w") as f:
+    with open(config_file, "w") as f:
         yaml.dump(config, f, sort_keys=False)
 
 
 def get_parser():
     parser = argparse.ArgumentParser(description="Finetuning on Intel GPU")
+    parser.add_argument("--config_file", type=str, required=True, default=None)
     parser.add_argument("--base_model", type=str, required=True, default=None)
     return parser
 
@@ -43,4 +45,4 @@ def get_parser():
     parser = get_parser()
     args = parser.parse_args()
 
-    update_finetune_config(args.base_model)
+    update_finetune_config(args.config_file, args.base_model)
diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml
@@ -8,17 +8,17 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://proxy-chain.intel.com:911'
+        default: 'http://10.24.221.149:911'
       https_proxy:
         type: string
-        default: 'http://proxy-chain.intel.com:911'
+        default: 'http://10.24.221.149:911'
 
 jobs:
   finetune:
     name: finetune on gpu test
     strategy:
       matrix:
-        model: [ pythia-6.9b, gpt-j-6b ]
+        model: [ meta-llama/Llama-2-7b-chat-hf ]
     runs-on: self-hosted
 
     defaults:
@@ -41,6 +41,6 @@ jobs:
           rm ~/borealis-runner/llm-on-ray.tar.gz -f
           tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
           cd ~/borealis-runner/
-          python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
+          python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}"
       - name: Test Summary
-        run: echo "to be continued"
+        run: echo "to be continued"
diff --git a/common/dataset/huggingface_dataset.py b/common/dataset/huggingface_dataset.py
@@ -25,7 +25,7 @@ def __call__(self, config):
             if validation_file is not None:
                 validation_dataset = local_load(validation_file)
                 return datasets.DatasetDict(
-                    {"train": train_dataset, "validation_dataset": validation_dataset}
+                    {"train": train_dataset, "validation": validation_dataset}
                 )
             if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0:
                 datasets_dict = train_dataset.train_test_split(

diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py
@@ -155,10 +155,10 @@ def train(self):
         max_train_step = self.config.get("max_train_step")
         max_eval_step = self.config.get("max_eval_step")
         for idx in range(self.starting_epoch, num_train_epochs, 1):
-            logger.info(f"start train epoch {idx}")
             self.model.train()
             start = time.time()
             total_steps = len(self.train_dataloader)
+            logger.info(f"Start training epoch {idx}, total_steps {total_steps}")
             for step, batch in enumerate(self.train_dataloader):
                 with self.accelerator.accumulate(self.model):
                     outputs = self.model(**batch)
@@ -172,13 +172,14 @@ def train(self):
                     if step % logging_steps == 0:
                         loss = loss.item()
                         ppl = math.exp(loss)
+                        epochs = (step + idx * total_steps) / (num_train_epochs * total_steps)
                         logger.info(
-                            f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
+                            f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
                         )
                         report(
                             {
-                                "loss": loss,
-                                "ppl": ppl,
+                                "train_loss": loss,
+                                "train_ppl": ppl,
                                 "train_epoch": idx,
                                 "total_epochs": num_train_epochs,
                                 "train_step": step,
@@ -187,10 +188,6 @@ def train(self):
                                 else total_steps,
                             }
                         )
-                        self.accelerator.log(
-                            {"train loss": loss, "train perplexity": ppl},
-                            step=idx * total_steps + step,
-                        )
                         start = time.time()
                 if max_train_step is not None:
                     if step >= max_train_step - 1:
@@ -221,9 +218,6 @@ def train(self):
                 except OverflowError:
                     eval_loss = float("inf")
                     perplexity = float("inf")
-                self.accelerator.log(
-                    {"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
-                )
                 logger.info(
                     f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
                 )
@@ -243,8 +237,6 @@ def train(self):
             )
             logger.info(f"finish save model to {output}")
 
-        self.accelerator.end_training()
-
         self.accelerator.wait_for_everyone()
 
     def _get_local_path(self, root_path, model_name):

diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md
@@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow.
 |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.|
 |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
 |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
-|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
 |config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
 |lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
 |deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
@@ -34,7 +33,7 @@ The following are the parameters supported in the finetuning workflow.
 |learning_rate|1e-5|Initial learning rate to use.|
 |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"|
 |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.|
-|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set.
+|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16". Default is "no" if not set.|
 |device|CPU|The device type used, can be "CPU", "GPU".|
 |num_training_workers|2|The number of the training process.|
 |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.|

diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml
@@ -11,6 +11,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data.jsonl
   validation_file: null
@@ -22,9 +23,12 @@ Training:
   learning_rate: 1.0e-05
   lr_scheduler: linear
   weight_decay: 0.0
+  mixed_precision: bf16
   device: GPU
   num_training_workers: 2
   accelerate_mode: GPU_DDP
   resources_per_worker:
     CPU: 1
     GPU: 1
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/finetune.py b/finetune/finetune.py
@@ -2,7 +2,7 @@
 
 import os
 import argparse
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, Optional
 
 import torch
 import accelerate
@@ -63,12 +63,13 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
     return mode_env_vars[mode]
 
 
-def convert_dtype(dtype: str) -> torch.dtype:
-    supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
-    if dtype in supported_dtypes:
-        return supported_dtypes[dtype]
-    else:
-        raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")
+def convert_dtype(dtype: str) -> Optional[torch.dtype]:
+    supported_dtypes = {
+        "fp16": torch.float16,
+        "bf16": torch.bfloat16,
+        "no": None,
+    }
+    return supported_dtypes[dtype]
 
 
 def train_func(config: Dict[str, Any]):
@@ -89,24 +90,14 @@ def train_func(config: Dict[str, Any]):
     else:
         fsdp_plugin = None
 
-    log_with = "tensorboard"  # only support tensorboard as tracker
     output_dir = config["General"]["output_dir"]
-    tracking_dir = config["General"]["tracking_dir"]
     accelerator = accelerate.Accelerator(
         gradient_accumulation_steps=gradient_accumulation_steps,
         fsdp_plugin=fsdp_plugin,
-        log_with=log_with,
-        project_dir=tracking_dir,
     )
     epochs = config["Training"]["epochs"]
-    tracker_config = {
-        "epochs": epochs,
-        "learning_rate": config["Training"]["learning_rate"],
-        "batch_size": config["Training"]["batch_size"],
-    }
     base_model = config["General"]["base_model"]
     dataset_file = config["Dataset"]["train_file"]
-    accelerator.init_trackers("fine-tuning", config=tracker_config)
 
     common.logger.info(
         f"accelerator generate finish, accelerator device type = {accelerator.device}"
@@ -134,9 +125,11 @@ def train_func(config: Dict[str, Any]):
     model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
         config={
             "name": base_model,
-            "dtype": convert_dtype(config["Training"]["mixed_precision"]),
+            "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")),
             "config": config["General"]["config"],
-            "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
+            "enable_gradient_checkpointing": config["General"].get(
+                "enable_gradient_checkpointing", False
+            ),
             "lora_config": config["General"]["lora_config"]
             if config["General"].get("lora_config")
             else None,

diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml
@@ -3,7 +3,6 @@ General:
   gpt_base_model: true
   output_dir: /tmp/llm-ray/output
   checkpoint_dir: /tmp/llm-ray/checkpoint
-  tracking_dir: /tmp/llm-ray/tracking
   config:
     trust_remote_code: false
     use_auth_token: null
@@ -30,5 +29,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
-  gradient_accumulation_steps: 2
+  gradient_accumulation_steps: 1
   logging_steps: 10
diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py
@@ -26,7 +26,6 @@ class General(BaseModel):
     gpt_base_model: bool
     output_dir: str
     checkpoint_dir: str
-    tracking_dir: str
     config: GeneralConfig
     lora_config: Optional[LoraConfig] = None
     deltatuner_config: Optional[DeltatunerConfig] = None
@@ -56,7 +55,7 @@ class Training(BaseModel):
     resources_per_worker: RayResourceConfig
     accelerate_mode: str
     mixed_precision: str = "no"
-    gradient_accumulation_steps: int
+    gradient_accumulation_steps: int = 1
     logging_steps: int = 10
 
     @validator("device")
@@ -73,6 +72,13 @@ def check_accelerate_mode(cls, v: str):
             raise ValueError(f"accelerate_mode must be one of {modes}")
         return v
 
+    @validator("mixed_precision")
+    def check_mixed_precision(cls, v: str):
+        supported_precisions = ["no", "fp16", "bf16"]
+        if v not in supported_precisions:
+            raise ValueError(f"mixed_precision must be one of {supported_precisions}")
+        return v
+
     @validator("logging_steps")
     def check_logging_steps(cls, v: int):
         assert v > 0

diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml
@@ -11,6 +11,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +29,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml
@@ -11,6 +11,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +29,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml
@@ -11,6 +11,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +29,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml
@@ -11,6 +11,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +29,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml
@@ -11,6 +11,10 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+    target_modules:
+    - q_proj
+    - v_proj
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +32,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml
@@ -20,6 +20,7 @@ General:
     - up_proj
     - down_proj
     - lm_head
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -37,3 +38,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml
@@ -11,6 +11,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +29,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10