diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py
index e46dda811..38d49143b 100644
--- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py
+++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py
@@ -2,9 +2,8 @@
import argparse
-def update_finetune_config(base_model):
- conf_file = "finetune/finetune.yaml"
- with open(conf_file) as f:
+def update_finetune_config(config_file, base_model):
+ with open(config_file) as f:
config = yaml.load(f, Loader=yaml.FullLoader)
# due to compute node can't connect network
# base models are downloaded as local files in directory ~/models/
@@ -23,18 +22,21 @@ def update_finetune_config(base_model):
# pythia-6.9b
config["General"]["base_model"] = base_model
- # config["General"]["base_model"] = "pythia-70m"
+ config["General"]["output_dir"] = "./output"
+ config["General"]["checkpoint_dir"] = "./checkpoint"
config["Training"]["device"] = "GPU"
config["Training"]["resources_per_worker"]["CPU"] = 1
config["Training"]["resources_per_worker"]["GPU"] = 1
config["Training"]["accelerate_mode"] = "GPU_DDP"
+ config["Training"]["logging_steps"] = 1
- with open(conf_file, "w") as f:
+ with open(config_file, "w") as f:
yaml.dump(config, f, sort_keys=False)
def get_parser():
parser = argparse.ArgumentParser(description="Finetuning on Intel GPU")
+ parser.add_argument("--config_file", type=str, required=True, default=None)
parser.add_argument("--base_model", type=str, required=True, default=None)
return parser
@@ -43,4 +45,4 @@ def get_parser():
parser = get_parser()
args = parser.parse_args()
- update_finetune_config(args.base_model)
+ update_finetune_config(args.config_file, args.base_model)
diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml
index 561522c27..4dcb43da3 100644
--- a/.github/workflows/workflow_finetune_gpu.yml
+++ b/.github/workflows/workflow_finetune_gpu.yml
@@ -8,17 +8,17 @@ on:
default: '10.1.2.13:5000/llmray-build'
http_proxy:
type: string
- default: 'http://proxy-chain.intel.com:911'
+ default: 'http://10.24.221.149:911'
https_proxy:
type: string
- default: 'http://proxy-chain.intel.com:911'
+ default: 'http://10.24.221.149:911'
jobs:
finetune:
name: finetune on gpu test
strategy:
matrix:
- model: [ pythia-6.9b, gpt-j-6b ]
+ model: [ meta-llama/Llama-2-7b-chat-hf ]
runs-on: self-hosted
defaults:
@@ -41,6 +41,6 @@ jobs:
rm ~/borealis-runner/llm-on-ray.tar.gz -f
tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
cd ~/borealis-runner/
- python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
+ python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}"
- name: Test Summary
- run: echo "to be continued"
\ No newline at end of file
+ run: echo "to be continued"
diff --git a/common/dataset/huggingface_dataset.py b/common/dataset/huggingface_dataset.py
index 9173e067f..3b9214aaf 100644
--- a/common/dataset/huggingface_dataset.py
+++ b/common/dataset/huggingface_dataset.py
@@ -25,7 +25,7 @@ def __call__(self, config):
if validation_file is not None:
validation_dataset = local_load(validation_file)
return datasets.DatasetDict(
- {"train": train_dataset, "validation_dataset": validation_dataset}
+ {"train": train_dataset, "validation": validation_dataset}
)
if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0:
datasets_dict = train_dataset.train_test_split(
diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py
index f05c6317f..5ed24f677 100644
--- a/common/trainer/default_trainer.py
+++ b/common/trainer/default_trainer.py
@@ -155,10 +155,10 @@ def train(self):
max_train_step = self.config.get("max_train_step")
max_eval_step = self.config.get("max_eval_step")
for idx in range(self.starting_epoch, num_train_epochs, 1):
- logger.info(f"start train epoch {idx}")
self.model.train()
start = time.time()
total_steps = len(self.train_dataloader)
+ logger.info(f"Start training epoch {idx}, total_steps {total_steps}")
for step, batch in enumerate(self.train_dataloader):
with self.accelerator.accumulate(self.model):
outputs = self.model(**batch)
@@ -172,13 +172,14 @@ def train(self):
if step % logging_steps == 0:
loss = loss.item()
ppl = math.exp(loss)
+ epochs = (step + idx * total_steps) / (num_train_epochs * total_steps)
logger.info(
- f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
+ f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
)
report(
{
- "loss": loss,
- "ppl": ppl,
+ "train_loss": loss,
+ "train_ppl": ppl,
"train_epoch": idx,
"total_epochs": num_train_epochs,
"train_step": step,
@@ -187,10 +188,6 @@ def train(self):
else total_steps,
}
)
- self.accelerator.log(
- {"train loss": loss, "train perplexity": ppl},
- step=idx * total_steps + step,
- )
start = time.time()
if max_train_step is not None:
if step >= max_train_step - 1:
@@ -221,9 +218,6 @@ def train(self):
except OverflowError:
eval_loss = float("inf")
perplexity = float("inf")
- self.accelerator.log(
- {"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
- )
logger.info(
f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
)
@@ -243,8 +237,6 @@ def train(self):
)
logger.info(f"finish save model to {output}")
- self.accelerator.end_training()
-
self.accelerator.wait_for_everyone()
def _get_local_path(self, root_path, model_name):
diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md
index a10bb8c33..69a906e86 100644
--- a/docs/finetune_parameters.md
+++ b/docs/finetune_parameters.md
@@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow.
|gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.|
|output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
|checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
-|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
|config|trust_remote_code: False
use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
|lora_config|task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
|deltatuner_config|"algo": "lora"
"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
@@ -34,7 +33,7 @@ The following are the parameters supported in the finetuning workflow.
|learning_rate|1e-5|Initial learning rate to use.|
|lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"|
|weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.|
-|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set.
+|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16" or "bf16". Default is "no" if not set.|
|device|CPU|The device type used, can be "CPU", "GPU".|
|num_training_workers|2|The number of the training process.|
|resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.|
diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml
index 41303f615..3cfa96913 100644
--- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml
+++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml
@@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data.jsonl
validation_file: null
@@ -22,9 +23,12 @@ Training:
learning_rate: 1.0e-05
lr_scheduler: linear
weight_decay: 0.0
+ mixed_precision: bf16
device: GPU
num_training_workers: 2
accelerate_mode: GPU_DDP
resources_per_worker:
CPU: 1
GPU: 1
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/finetune.py b/finetune/finetune.py
index 7ab0183db..f6d2f15c6 100644
--- a/finetune/finetune.py
+++ b/finetune/finetune.py
@@ -2,7 +2,7 @@
import os
import argparse
-from typing import Any, Dict, Union
+from typing import Any, Dict, Union, Optional
import torch
import accelerate
@@ -63,12 +63,13 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
return mode_env_vars[mode]
-def convert_dtype(dtype: str) -> torch.dtype:
- supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
- if dtype in supported_dtypes:
- return supported_dtypes[dtype]
- else:
- raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")
+def convert_dtype(dtype: str) -> Optional[torch.dtype]:
+ supported_dtypes = {
+ "fp16": torch.float16,
+ "bf16": torch.bfloat16,
+ "no": None,
+ }
+ return supported_dtypes[dtype]
def train_func(config: Dict[str, Any]):
@@ -89,24 +90,14 @@ def train_func(config: Dict[str, Any]):
else:
fsdp_plugin = None
- log_with = "tensorboard" # only support tensorboard as tracker
output_dir = config["General"]["output_dir"]
- tracking_dir = config["General"]["tracking_dir"]
accelerator = accelerate.Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps,
fsdp_plugin=fsdp_plugin,
- log_with=log_with,
- project_dir=tracking_dir,
)
epochs = config["Training"]["epochs"]
- tracker_config = {
- "epochs": epochs,
- "learning_rate": config["Training"]["learning_rate"],
- "batch_size": config["Training"]["batch_size"],
- }
base_model = config["General"]["base_model"]
dataset_file = config["Dataset"]["train_file"]
- accelerator.init_trackers("fine-tuning", config=tracker_config)
common.logger.info(
f"accelerator generate finish, accelerator device type = {accelerator.device}"
@@ -134,9 +125,11 @@ def train_func(config: Dict[str, Any]):
model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
config={
"name": base_model,
- "dtype": convert_dtype(config["Training"]["mixed_precision"]),
+ "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")),
"config": config["General"]["config"],
- "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
+ "enable_gradient_checkpointing": config["General"].get(
+ "enable_gradient_checkpointing", False
+ ),
"lora_config": config["General"]["lora_config"]
if config["General"].get("lora_config")
else None,
diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml
index 285520d82..4f27b2ebd 100644
--- a/finetune/finetune.yaml
+++ b/finetune/finetune.yaml
@@ -3,7 +3,6 @@ General:
gpt_base_model: true
output_dir: /tmp/llm-ray/output
checkpoint_dir: /tmp/llm-ray/checkpoint
- tracking_dir: /tmp/llm-ray/tracking
config:
trust_remote_code: false
use_auth_token: null
@@ -30,5 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
- gradient_accumulation_steps: 2
+ gradient_accumulation_steps: 1
logging_steps: 10
diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py
index 6a269b7ee..c24114394 100644
--- a/finetune/finetune_config.py
+++ b/finetune/finetune_config.py
@@ -26,7 +26,6 @@ class General(BaseModel):
gpt_base_model: bool
output_dir: str
checkpoint_dir: str
- tracking_dir: str
config: GeneralConfig
lora_config: Optional[LoraConfig] = None
deltatuner_config: Optional[DeltatunerConfig] = None
@@ -56,7 +55,7 @@ class Training(BaseModel):
resources_per_worker: RayResourceConfig
accelerate_mode: str
mixed_precision: str = "no"
- gradient_accumulation_steps: int
+ gradient_accumulation_steps: int = 1
logging_steps: int = 10
@validator("device")
@@ -73,6 +72,13 @@ def check_accelerate_mode(cls, v: str):
raise ValueError(f"accelerate_mode must be one of {modes}")
return v
+ @validator("mixed_precision")
+ def check_mixed_precision(cls, v: str):
+ supported_precisions = ["no", "fp16", "bf16"]
+ if v not in supported_precisions:
+ raise ValueError(f"mixed_precision must be one of {supported_precisions}")
+ return v
+
@validator("logging_steps")
def check_logging_steps(cls, v: int):
assert v > 0
diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml
index c2999ce7a..e96ee51fb 100644
--- a/finetune/models/bloom-560m.yaml
+++ b/finetune/models/bloom-560m.yaml
@@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml
index f0092022d..4f27b2ebd 100644
--- a/finetune/models/finetune_config_template.yaml
+++ b/finetune/models/finetune_config_template.yaml
@@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml
index f0092022d..4f27b2ebd 100644
--- a/finetune/models/gpt-j-6b.yaml
+++ b/finetune/models/gpt-j-6b.yaml
@@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml
index 0f9dbf9a8..03048ee29 100644
--- a/finetune/models/gpt2.yaml
+++ b/finetune/models/gpt2.yaml
@@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml
index 56348b2d1..d9a304ee4 100644
--- a/finetune/models/llama-2-7b-chat-hf.yaml
+++ b/finetune/models/llama-2-7b-chat-hf.yaml
@@ -11,6 +11,10 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ target_modules:
+ - q_proj
+ - v_proj
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +32,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml
index 8e3eec5ce..b7003c31b 100644
--- a/finetune/models/mistral-7b-v0.1.yaml
+++ b/finetune/models/mistral-7b-v0.1.yaml
@@ -20,6 +20,7 @@ General:
- up_proj
- down_proj
- lm_head
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -37,3 +38,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10
diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml
index e8f04d209..e84c4116a 100644
--- a/finetune/models/mpt-7b-chat.yaml
+++ b/finetune/models/mpt-7b-chat.yaml
@@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
+ enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
+ gradient_accumulation_steps: 1
+ logging_steps: 10