diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index e46dda811..38d49143b 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -2,9 +2,8 @@ import argparse -def update_finetune_config(base_model): - conf_file = "finetune/finetune.yaml" - with open(conf_file) as f: +def update_finetune_config(config_file, base_model): + with open(config_file) as f: config = yaml.load(f, Loader=yaml.FullLoader) # due to compute node can't connect network # base models are downloaded as local files in directory ~/models/ @@ -23,18 +22,21 @@ def update_finetune_config(base_model): # pythia-6.9b config["General"]["base_model"] = base_model - # config["General"]["base_model"] = "pythia-70m" + config["General"]["output_dir"] = "./output" + config["General"]["checkpoint_dir"] = "./checkpoint" config["Training"]["device"] = "GPU" config["Training"]["resources_per_worker"]["CPU"] = 1 config["Training"]["resources_per_worker"]["GPU"] = 1 config["Training"]["accelerate_mode"] = "GPU_DDP" + config["Training"]["logging_steps"] = 1 - with open(conf_file, "w") as f: + with open(config_file, "w") as f: yaml.dump(config, f, sort_keys=False) def get_parser(): parser = argparse.ArgumentParser(description="Finetuning on Intel GPU") + parser.add_argument("--config_file", type=str, required=True, default=None) parser.add_argument("--base_model", type=str, required=True, default=None) return parser @@ -43,4 +45,4 @@ def get_parser(): parser = get_parser() args = parser.parse_args() - update_finetune_config(args.base_model) + update_finetune_config(args.config_file, args.base_model) diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 561522c27..4dcb43da3 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml 
@@ -8,17 +8,17 @@ on: default: '10.1.2.13:5000/llmray-build' http_proxy: type: string - default: 'http://proxy-chain.intel.com:911' + default: 'http://10.24.221.149:911' https_proxy: type: string - default: 'http://proxy-chain.intel.com:911' + default: 'http://10.24.221.149:911' jobs: finetune: name: finetune on gpu test strategy: matrix: - model: [ pythia-6.9b, gpt-j-6b ] + model: [ meta-llama/Llama-2-7b-chat-hf ] runs-on: self-hosted defaults: @@ -41,6 +41,6 @@ jobs: rm ~/borealis-runner/llm-on-ray.tar.gz -f tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray . cd ~/borealis-runner/ - python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" + python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}" - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" diff --git a/common/dataset/huggingface_dataset.py b/common/dataset/huggingface_dataset.py index 9173e067f..3b9214aaf 100644 --- a/common/dataset/huggingface_dataset.py +++ b/common/dataset/huggingface_dataset.py @@ -25,7 +25,7 @@ def __call__(self, config): if validation_file is not None: validation_dataset = local_load(validation_file) return datasets.DatasetDict( - {"train": train_dataset, "validation_dataset": validation_dataset} + {"train": train_dataset, "validation": validation_dataset} ) if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0: datasets_dict = train_dataset.train_test_split( diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index f05c6317f..5ed24f677 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -155,10 +155,10 @@ def train(self): max_train_step = self.config.get("max_train_step") max_eval_step = self.config.get("max_eval_step") for idx in range(self.starting_epoch, num_train_epochs, 1): - logger.info(f"start train epoch {idx}") self.model.train() start = 
time.time() total_steps = len(self.train_dataloader) + logger.info(f"Start training epoch {idx}, total_steps {total_steps}") for step, batch in enumerate(self.train_dataloader): with self.accelerator.accumulate(self.model): outputs = self.model(**batch) @@ -172,13 +172,14 @@ def train(self): if step % logging_steps == 0: loss = loss.item() ppl = math.exp(loss) + epochs = (step + idx * total_steps) / (num_train_epochs * total_steps) logger.info( - f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" + f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" ) report( { - "loss": loss, - "ppl": ppl, + "train_loss": loss, + "train_ppl": ppl, "train_epoch": idx, "total_epochs": num_train_epochs, "train_step": step, @@ -187,10 +188,6 @@ def train(self): else total_steps, } ) - self.accelerator.log( - {"train loss": loss, "train perplexity": ppl}, - step=idx * total_steps + step, - ) start = time.time() if max_train_step is not None: if step >= max_train_step - 1: @@ -221,9 +218,6 @@ def train(self): except OverflowError: eval_loss = float("inf") perplexity = float("inf") - self.accelerator.log( - {"evaluate loss": eval_loss, "evaluate perplexity": perplexity} - ) logger.info( f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]" ) @@ -243,8 +237,6 @@ def train(self): ) logger.info(f"finish save model to {output}") - self.accelerator.end_training() - self.accelerator.wait_for_everyone() def _get_local_path(self, root_path, model_name): diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index a10bb8c33..69a906e86 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow. 
|gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.| |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model| |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint| -|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers| |config|trust_remote_code: False
use_auth_token: None|Will be passed to the transformers `from_pretrained()` method| |lora_config|task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"
"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| @@ -34,7 +33,7 @@ The following are the parameters supported in the finetuning workflow. |learning_rate|1e-5|Initial learning rate to use.| |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"| |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.| -|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set. +|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", or "bf16". Default is "no" if not set.| |device|CPU|The device type used, can be "CPU", "GPU".| |num_training_workers|2|The number of the training process.| |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. 
If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.| diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml index 41303f615..3cfa96913 100644 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data.jsonl validation_file: null @@ -22,9 +23,12 @@ Training: learning_rate: 1.0e-05 lr_scheduler: linear weight_decay: 0.0 + mixed_precision: bf16 device: GPU num_training_workers: 2 accelerate_mode: GPU_DDP resources_per_worker: CPU: 1 GPU: 1 + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/finetune.py b/finetune/finetune.py index 7ab0183db..f6d2f15c6 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -2,7 +2,7 @@ import os import argparse -from typing import Any, Dict, Union +from typing import Any, Dict, Union, Optional import torch import accelerate @@ -63,12 +63,13 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any], return mode_env_vars[mode] -def convert_dtype(dtype: str) -> torch.dtype: - supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} - if dtype in supported_dtypes: - return supported_dtypes[dtype] - else: - raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]") +def convert_dtype(dtype: str) -> Optional[torch.dtype]: + supported_dtypes = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "no": None, + } + return supported_dtypes[dtype] def train_func(config: Dict[str, Any]): @@ -89,24 +90,14 @@ def train_func(config: Dict[str, Any]): else: fsdp_plugin = None - log_with = "tensorboard" # only support tensorboard as tracker output_dir = config["General"]["output_dir"] - tracking_dir = config["General"]["tracking_dir"] accelerator = 
accelerate.Accelerator( gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin, - log_with=log_with, - project_dir=tracking_dir, ) epochs = config["Training"]["epochs"] - tracker_config = { - "epochs": epochs, - "learning_rate": config["Training"]["learning_rate"], - "batch_size": config["Training"]["batch_size"], - } base_model = config["General"]["base_model"] dataset_file = config["Dataset"]["train_file"] - accelerator.init_trackers("fine-tuning", config=tracker_config) common.logger.info( f"accelerator generate finish, accelerator device type = {accelerator.device}" @@ -134,9 +125,11 @@ def train_func(config: Dict[str, Any]): model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()( config={ "name": base_model, - "dtype": convert_dtype(config["Training"]["mixed_precision"]), + "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")), "config": config["General"]["config"], - "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"], + "enable_gradient_checkpointing": config["General"].get( + "enable_gradient_checkpointing", False + ), "lora_config": config["General"]["lora_config"] if config["General"].get("lora_config") else None, diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml index 285520d82..4f27b2ebd 100644 --- a/finetune/finetune.yaml +++ b/finetune/finetune.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -30,5 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP - gradient_accumulation_steps: 2 + gradient_accumulation_steps: 1 logging_steps: 10 diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index 6a269b7ee..c24114394 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -26,7 +26,6 @@ class General(BaseModel): gpt_base_model: 
bool output_dir: str checkpoint_dir: str - tracking_dir: str config: GeneralConfig lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None @@ -56,7 +55,7 @@ class Training(BaseModel): resources_per_worker: RayResourceConfig accelerate_mode: str mixed_precision: str = "no" - gradient_accumulation_steps: int + gradient_accumulation_steps: int = 1 logging_steps: int = 10 @validator("device") @@ -73,6 +72,13 @@ def check_accelerate_mode(cls, v: str): raise ValueError(f"accelerate_mode must be one of {modes}") return v + @validator("mixed_precision") + def check_mixed_precision(cls, v: str): + supported_precisions = ["no", "fp16", "bf16"] + if v not in supported_precisions: + raise ValueError(f"mixed_precision must be one of {supported_precisions}") + return v + @validator("logging_steps") def check_logging_steps(cls, v: int): assert v > 0 diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml index c2999ce7a..e96ee51fb 100644 --- a/finetune/models/bloom-560m.yaml +++ b/finetune/models/bloom-560m.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml index f0092022d..4f27b2ebd 100644 --- a/finetune/models/finetune_config_template.yaml +++ b/finetune/models/finetune_config_template.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git 
a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml index f0092022d..4f27b2ebd 100644 --- a/finetune/models/gpt-j-6b.yaml +++ b/finetune/models/gpt-j-6b.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml index 0f9dbf9a8..03048ee29 100644 --- a/finetune/models/gpt2.yaml +++ b/finetune/models/gpt2.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 56348b2d1..d9a304ee4 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -11,6 +11,10 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + target_modules: + - q_proj + - v_proj + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +32,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 8e3eec5ce..b7003c31b 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -20,6 +20,7 @@ General: - up_proj - down_proj - lm_head + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -37,3 +38,5 
@@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml index e8f04d209..e84c4116a 100644 --- a/finetune/models/mpt-7b-chat.yaml +++ b/finetune/models/mpt-7b-chat.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10