From db8ffc7178390983fa43276d152eccc44f06840b Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 16:32:06 +0000 Subject: [PATCH 01/10] fixes --- common/dataset/huggingface_dataset.py | 2 +- common/trainer/default_trainer.py | 5 +++-- finetune/finetune.py | 16 +++++++++------- finetune/finetune.yaml | 2 +- finetune/finetune_config.py | 9 ++++++++- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/common/dataset/huggingface_dataset.py b/common/dataset/huggingface_dataset.py index 9173e067f..3b9214aaf 100644 --- a/common/dataset/huggingface_dataset.py +++ b/common/dataset/huggingface_dataset.py @@ -25,7 +25,7 @@ def __call__(self, config): if validation_file is not None: validation_dataset = local_load(validation_file) return datasets.DatasetDict( - {"train": train_dataset, "validation_dataset": validation_dataset} + {"train": train_dataset, "validation": validation_dataset} ) if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0: datasets_dict = train_dataset.train_test_split( diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index f05c6317f..7d76aafa6 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -155,10 +155,10 @@ def train(self): max_train_step = self.config.get("max_train_step") max_eval_step = self.config.get("max_eval_step") for idx in range(self.starting_epoch, num_train_epochs, 1): - logger.info(f"start train epoch {idx}") self.model.train() start = time.time() total_steps = len(self.train_dataloader) + logger.info(f"start train epoch {idx}, total_steps {total_steps}") for step, batch in enumerate(self.train_dataloader): with self.accelerator.accumulate(self.model): outputs = self.model(**batch) @@ -172,8 +172,9 @@ def train(self): if step % logging_steps == 0: loss = loss.item() ppl = math.exp(loss) + epochs = (step + idx * total_steps) / (num_train_epochs * total_steps) logger.info( - f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" + f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" ) report( { diff --git a/finetune/finetune.py b/finetune/finetune.py index 7ab0183db..e79cd3293 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -2,7 +2,7 @@ import os import argparse -from typing import Any, Dict, Union +from typing import Any, Dict, Union, Optional import torch import accelerate @@ -63,12 +63,14 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any], return mode_env_vars[mode] -def convert_dtype(dtype: str) -> torch.dtype: - supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} - if dtype in supported_dtypes: - return supported_dtypes[dtype] - else: - raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]") +def convert_dtype(dtype: str) -> Optional[torch.dtype]: + supported_dtypes = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + "no": None, + } + return supported_dtypes[dtype] def train_func(config: Dict[str, Any]): diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml index 285520d82..ed58b4deb 100644 --- a/finetune/finetune.yaml +++ b/finetune/finetune.yaml @@ -30,5 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP - gradient_accumulation_steps: 2 + gradient_accumulation_steps: 1 logging_steps: 10 diff --git a/finetune/finetune_config.py 
b/finetune/finetune_config.py index 6a269b7ee..e15e6051c 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -56,7 +56,7 @@ class Training(BaseModel): resources_per_worker: RayResourceConfig accelerate_mode: str mixed_precision: str = "no" - gradient_accumulation_steps: int + gradient_accumulation_steps: int = 1 logging_steps: int = 10 @validator("device") @@ -73,6 +73,13 @@ def check_accelerate_mode(cls, v: str): raise ValueError(f"accelerate_mode must be one of {modes}") return v + @validator("mixed_precision") + def check_mixed_precision(cls, v: str): + supported_precisions = ["no", "fp16", "bf16", "fp32"] + if v not in supported_precisions: + raise ValueError(f"mixed_precision must be on of {supported_precisions}") + return v + @validator("logging_steps") def check_logging_steps(cls, v: int): assert v > 0 From f215f4ad4a59d164ef27f2813a22246e8697fff5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 18:24:28 +0000 Subject: [PATCH 02/10] update --- .github/workflows/workflow_finetune_gpu.yml | 8 ++++---- examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml | 5 +++++ finetune/models/gpt-j-6b.yaml | 4 ++++ finetune/models/gpt2.yaml | 4 ++++ finetune/models/llama-2-7b-chat-hf.yaml | 11 +++++++++-- finetune/models/mistral-7b-v0.1.yaml | 8 ++++++-- finetune/models/mpt-7b-chat.yaml | 4 ++++ 7 files changed, 36 insertions(+), 8 deletions(-) diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 561522c27..416cbd458 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -8,17 +8,17 @@ on: default: '10.1.2.13:5000/llmray-build' http_proxy: type: string - default: 'http://proxy-chain.intel.com:911' + default: 'http://10.24.221.149:911' https_proxy: type: string - default: 'http://proxy-chain.intel.com:911' + default: 'http://10.24.221.149:911' jobs: finetune: name: finetune on gpu test strategy: matrix: - model: [ pythia-6.9b, gpt-j-6b ] + model: [ meta-llama/Llama-2-7b-chat-hf, mistralai/Mistral-7B-v0.1 ] runs-on: self-hosted defaults: @@ -43,4 +43,4 @@ jobs: cd ~/borealis-runner/ python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml index 41303f615..78d126b55 100644 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data.jsonl validation_file: null @@ -22,9 +24,12 @@ Training: learning_rate: 1.0e-05 lr_scheduler: linear weight_decay: 0.0 + mixed_precision: bf16 device: GPU num_training_workers: 2 accelerate_mode: GPU_DDP resources_per_worker: CPU: 1 GPU: 1 + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml index f0092022d..ed58b4deb 100644 --- a/finetune/models/gpt-j-6b.yaml +++ b/finetune/models/gpt-j-6b.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: 
/tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml index 0f9dbf9a8..e484a05e2 100644 --- a/finetune/models/gpt2.yaml +++ b/finetune/models/gpt2.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 56348b2d1..2b45ad6f7 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -1,8 +1,9 @@ General: base_model: meta-llama/Llama-2-7b-chat-hf gpt_base_model: false - output_dir: /tmp/llm-ray/output - checkpoint_dir: /tmp/llm-ray/checkpoint + output_dir: ./output + checkpoint_dir: ./checkpoint + tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,10 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + target_modules: + - q_proj + - v_proj + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +33,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 8e3eec5ce..7196aec82 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -1,8 +1,9 @@ General: base_model: mistralai/Mistral-7B-v0.1 gpt_base_model: false - output_dir: /tmp/llm-ray/output - checkpoint_dir: /tmp/llm-ray/checkpoint + output_dir: ./output + checkpoint_dir: ./checkpoint + tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null @@ -20,6 +21,7 @@ General: - up_proj - down_proj - lm_head + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -37,3 +39,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml index e8f04d209..8218b9ad6 100644 --- a/finetune/models/mpt-7b-chat.yaml +++ b/finetune/models/mpt-7b-chat.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: true use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + 
gradient_accumulation_steps: 1 + logging_steps: 10 From c46650d6b477ca35c0703e3d5c08822ac35f68e5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 18:27:25 +0000 Subject: [PATCH 03/10] update --- common/trainer/default_trainer.py | 2 +- finetune/finetune_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index 7d76aafa6..4aa13b6eb 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -158,7 +158,7 @@ def train(self): self.model.train() start = time.time() total_steps = len(self.train_dataloader) - logger.info(f"start train epoch {idx}, total_steps {total_steps}") + logger.info(f"Start training epoch {idx}, total_steps {total_steps}") for step, batch in enumerate(self.train_dataloader): with self.accelerator.accumulate(self.model): outputs = self.model(**batch) diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index e15e6051c..779a488c7 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -77,7 +77,7 @@ def check_accelerate_mode(cls, v: str): def check_mixed_precision(cls, v: str): supported_precisions = ["no", "fp16", "bf16", "fp32"] if v not in supported_precisions: - raise ValueError(f"mixed_precision must be on of {supported_precisions}") + raise ValueError(f"mixed_precision must be one of {supported_precisions}") return v @validator("logging_steps") From 02535dacc25b1f95bb8a3bd2bdae73b75ddcf128 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 18:43:03 +0000 Subject: [PATCH 04/10] update --- docs/finetune_parameters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index a10bb8c33..6c9a5dbd3 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -34,7 +34,7 @@ The following are the parameters supported in the finetuning workflow. |learning_rate|1e-5|Initial learning rate to use.| |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"| |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.| -|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set. +|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp32". Default is "no" if not set. |device|CPU|The device type used, can be "CPU", "GPU".| |num_training_workers|2|The number of the training process.| |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. 
If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.| From c494983709660234962647f5eb463e36e1160ca3 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 6 Feb 2024 13:16:16 +0000 Subject: [PATCH 05/10] remove accelrator tracking, replace by ray train.report --- common/trainer/default_trainer.py | 13 ++----------- docs/finetune_parameters.md | 1 - examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml | 1 - finetune/finetune.py | 10 ---------- finetune/finetune.yaml | 1 - finetune/finetune_config.py | 1 - finetune/models/gpt-j-6b.yaml | 1 - finetune/models/gpt2.yaml | 1 - finetune/models/llama-2-7b-chat-hf.yaml | 1 - finetune/models/mistral-7b-v0.1.yaml | 1 - finetune/models/mpt-7b-chat.yaml | 1 - 11 files changed, 2 insertions(+), 30 deletions(-) diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index 4aa13b6eb..5ed24f677 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -178,8 +178,8 @@ def train(self): ) report( { - "loss": loss, - "ppl": ppl, + "train_loss": loss, + "train_ppl": ppl, "train_epoch": idx, "total_epochs": num_train_epochs, "train_step": step, @@ -188,10 +188,6 @@ def train(self): else total_steps, } ) - self.accelerator.log( - {"train loss": loss, "train perplexity": ppl}, - step=idx * total_steps + step, - ) start = time.time() if max_train_step is not None: if step >= max_train_step - 1: @@ -222,9 +218,6 @@ def train(self): except OverflowError: eval_loss = float("inf") perplexity = float("inf") - self.accelerator.log( - {"evaluate loss": eval_loss, "evaluate perplexity": perplexity} - ) logger.info( f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]" ) @@ -244,8 +237,6 @@ def train(self): ) logger.info(f"finish save model to {output}") - self.accelerator.end_training() - self.accelerator.wait_for_everyone() def _get_local_path(self, root_path, model_name): diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 6c9a5dbd3..00168767b 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow. |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.| |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model| |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint| -|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers| |config|trust_remote_code: False
use_auth_token: None|Will be passed to the transformers `from_pretrained()` method| |lora_config|task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"
"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml index 78d126b55..3cfa96913 100644 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/finetune.py b/finetune/finetune.py index e79cd3293..98586aecc 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -91,24 +91,14 @@ def train_func(config: Dict[str, Any]): else: fsdp_plugin = None - log_with = "tensorboard" # only support tensorboard as tracker output_dir = config["General"]["output_dir"] - tracking_dir = config["General"]["tracking_dir"] accelerator = accelerate.Accelerator( gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin, - log_with=log_with, - project_dir=tracking_dir, ) epochs = config["Training"]["epochs"] - tracker_config = { - "epochs": epochs, - "learning_rate": config["Training"]["learning_rate"], - "batch_size": config["Training"]["batch_size"], - } base_model = config["General"]["base_model"] dataset_file = config["Dataset"]["train_file"] - accelerator.init_trackers("fine-tuning", config=tracker_config) common.logger.info( f"accelerator generate finish, accelerator device type = {accelerator.device}" diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml index ed58b4deb..4f27b2ebd 100644 --- a/finetune/finetune.yaml +++ b/finetune/finetune.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index 779a488c7..819e8b1d3 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -26,7 +26,6 @@ class General(BaseModel): gpt_base_model: bool output_dir: str checkpoint_dir: str - tracking_dir: str config: GeneralConfig lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml index ed58b4deb..4f27b2ebd 100644 --- a/finetune/models/gpt-j-6b.yaml +++ b/finetune/models/gpt-j-6b.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml index e484a05e2..03048ee29 100644 --- a/finetune/models/gpt2.yaml +++ b/finetune/models/gpt2.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 2b45ad6f7..014c06b0b 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: false 
output_dir: ./output checkpoint_dir: ./checkpoint - tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 7196aec82..78f355b37 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: false output_dir: ./output checkpoint_dir: ./checkpoint - tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml index 8218b9ad6..e84c4116a 100644 --- a/finetune/models/mpt-7b-chat.yaml +++ b/finetune/models/mpt-7b-chat.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: true use_auth_token: null From ae3efbe50e913c0690ad0e7dbd7d42859a8549ce Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 6 Feb 2024 15:46:23 +0000 Subject: [PATCH 06/10] update --- finetune/finetune.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/finetune/finetune.py b/finetune/finetune.py index 98586aecc..c64eeeb4f 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -126,9 +126,11 @@ def train_func(config: Dict[str, Any]): model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()( config={ "name": base_model, - "dtype": convert_dtype(config["Training"]["mixed_precision"]), + "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")), "config": config["General"]["config"], - "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"], + "enable_gradient_checkpointing": config["General"].get( + "enable_gradient_checkpointing", "no" + ), "lora_config": config["General"]["lora_config"] if config["General"].get("lora_config") else None, From 59c757e9817266124bf6747c52be87a29d385b20 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 19 Feb 2024 16:04:48 +0000 Subject: [PATCH 07/10] update --- docs/finetune_parameters.md | 2 +- finetune/finetune.py | 1 - finetune/finetune_config.py | 2 +- finetune/models/bloom-560m.yaml | 3 +++ finetune/models/finetune_config_template.yaml | 3 +++ finetune/models/llama-2-7b-chat-hf.yaml | 4 ++-- finetune/models/mistral-7b-v0.1.yaml | 4 ++-- 7 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 00168767b..69a906e86 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -33,7 +33,7 @@ The following are the parameters supported in the finetuning workflow. |learning_rate|1e-5|Initial learning rate to use.| |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"| |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.| -|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp32". Default is "no" if not set. +|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16". Default is "no" if not set. 
|device|CPU|The device type used, can be "CPU", "GPU".| |num_training_workers|2|The number of the training process.| |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.| diff --git a/finetune/finetune.py b/finetune/finetune.py index c64eeeb4f..577695c8c 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -67,7 +67,6 @@ def convert_dtype(dtype: str) -> Optional[torch.dtype]: supported_dtypes = { "fp16": torch.float16, "bf16": torch.bfloat16, - "fp32": torch.float32, "no": None, } return supported_dtypes[dtype] diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index 819e8b1d3..c24114394 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -74,7 +74,7 @@ def check_accelerate_mode(cls, v: str): @validator("mixed_precision") def check_mixed_precision(cls, v: str): - supported_precisions = ["no", "fp16", "bf16", "fp32"] + supported_precisions = ["no", "fp16", "bf16"] if v not in supported_precisions: raise ValueError(f"mixed_precision must be one of {supported_precisions}") return v diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml index c2999ce7a..e96ee51fb 100644 --- a/finetune/models/bloom-560m.yaml +++ b/finetune/models/bloom-560m.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml index f0092022d..4f27b2ebd 100644 --- a/finetune/models/finetune_config_template.yaml +++ b/finetune/models/finetune_config_template.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 014c06b0b..d9a304ee4 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -1,8 +1,8 @@ General: base_model: meta-llama/Llama-2-7b-chat-hf gpt_base_model: false - output_dir: ./output - checkpoint_dir: ./checkpoint + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 78f355b37..b7003c31b 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -1,8 +1,8 @@ General: base_model: mistralai/Mistral-7B-v0.1 gpt_base_model: false - output_dir: ./output - checkpoint_dir: ./checkpoint + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint config: trust_remote_code: false use_auth_token: null From eba891ee7a4f94bd283641e0d81b5cc5e0707622 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 20 Feb 2024 09:10:08 +0000 Subject: [PATCH 08/10] update --- .../config/update_finetune_config_on_intel_gpu.py | 12 ++++++------ .github/workflows/workflow_finetune_gpu.yml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index e46dda811..eb19ad35b 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -2,9 +2,8 @@ import argparse -def update_finetune_config(base_model): - conf_file = "finetune/finetune.yaml" - with open(conf_file) as f: +def update_finetune_config(config_file, base_model): + with open(config_file) as f: config = yaml.load(f, Loader=yaml.FullLoader) # due to compute node can't connect network # base models are downloaded as local files in directory ~/models/ @@ -23,18 +22,19 @@ def update_finetune_config(base_model): # pythia-6.9b config["General"]["base_model"] = base_model - # config["General"]["base_model"] = "pythia-70m" config["Training"]["device"] = "GPU" config["Training"]["resources_per_worker"]["CPU"] = 1 config["Training"]["resources_per_worker"]["GPU"] = 1 config["Training"]["accelerate_mode"] = "GPU_DDP" + config["Training"]["logging_steps"] = 1 - with open(conf_file, "w") as f: + with open(config_file, "w") as f: yaml.dump(config, f, sort_keys=False) def get_parser(): parser = argparse.ArgumentParser(description="Finetuning on Intel GPU") + parser.add_argument("--config_file", type=str, required=True, default=None) parser.add_argument("--base_model", type=str, required=True, default=None) return parser @@ -43,4 +43,4 @@ def get_parser(): parser = get_parser() args = parser.parse_args() - update_finetune_config(args.base_model) + update_finetune_config(args.config_file, args.base_model) diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 416cbd458..57a32bc06 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -18,7 +18,7 @@ jobs: name: finetune on gpu test strategy: matrix: - model: [ meta-llama/Llama-2-7b-chat-hf, mistralai/Mistral-7B-v0.1 ] + model: [ meta-llama/Llama-2-7b-chat-hf ] runs-on: self-hosted defaults: From 884d1f31fc1daba731a95c4ab763d875f6d778b5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 20 Feb 2024 16:37:41 +0000 Subject: [PATCH 09/10] update --- .github/workflows/config/update_finetune_config_on_intel_gpu.py | 2 ++ .github/workflows/workflow_finetune_gpu.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index eb19ad35b..38d49143b 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -22,6 +22,8 @@ def update_finetune_config(config_file, base_model): # pythia-6.9b config["General"]["base_model"] = base_model + config["General"]["output_dir"] = "./output" + config["General"]["checkpoint_dir"] = "./checkpoint" config["Training"]["device"] = "GPU" config["Training"]["resources_per_worker"]["CPU"] = 1 config["Training"]["resources_per_worker"]["GPU"] = 1 diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 57a32bc06..4dcb43da3 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -41,6 +41,6 @@ jobs: rm ~/borealis-runner/llm-on-ray.tar.gz -f tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray . 
cd ~/borealis-runner/ - python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" + python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}" - name: Test Summary run: echo "to be continued" From 685ddbc8dd6b37ce99970c08e1116a4361c5542f Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Wed, 21 Feb 2024 13:44:24 +0000 Subject: [PATCH 10/10] update --- finetune/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/finetune.py b/finetune/finetune.py index 577695c8c..f6d2f15c6 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -128,7 +128,7 @@ def train_func(config: Dict[str, Any]): "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")), "config": config["General"]["config"], "enable_gradient_checkpointing": config["General"].get( - "enable_gradient_checkpointing", "no" + "enable_gradient_checkpointing", False ), "lora_config": config["General"]["lora_config"] if config["General"].get("lora_config")
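
For reference, the mixed-precision handling that patches 01, 03, and 07 converge on can be collapsed into a short standalone sketch. The function and validator bodies below are copied from the diffs above; the imports, the minimal Training model, and the usage at the bottom are filled in here as assumptions so the snippet runs on its own (pydantic v1-style @validator, as finetune_config.py already uses).

from typing import Optional

import torch
from pydantic import BaseModel, validator


def convert_dtype(dtype: str) -> Optional[torch.dtype]:
    # "no" maps to None, i.e. no explicit torch dtype is forced on the model.
    supported_dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "no": None,
    }
    return supported_dtypes[dtype]


class Training(BaseModel):
    # Minimal stand-in (assumption) for the full Training schema in finetune_config.py.
    mixed_precision: str = "no"
    gradient_accumulation_steps: int = 1
    logging_steps: int = 10

    @validator("mixed_precision")
    def check_mixed_precision(cls, v: str):
        supported_precisions = ["no", "fp16", "bf16"]
        if v not in supported_precisions:
            raise ValueError(f"mixed_precision must be one of {supported_precisions}")
        return v


cfg = Training(mixed_precision="bf16")
print(convert_dtype(cfg.mixed_precision))  # -> torch.bfloat16

Similarly, the progress value logged in default_trainer.py by patch 01, (step + idx * total_steps) / (num_train_epochs * total_steps), runs from 0.0 at the first step to just under 1.0 at the last step of the final epoch, so the single "train epoch:" number reported to Ray replaces the earlier epoch/step pair.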