From db8ffc7178390983fa43276d152eccc44f06840b Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 16:32:06 +0000 Subject: [PATCH 01/10] fixes --- common/dataset/huggingface_dataset.py | 2 +- common/trainer/default_trainer.py | 5 +++-- finetune/finetune.py | 16 +++++++++------- finetune/finetune.yaml | 2 +- finetune/finetune_config.py | 9 ++++++++- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/common/dataset/huggingface_dataset.py b/common/dataset/huggingface_dataset.py index 9173e067f..3b9214aaf 100644 --- a/common/dataset/huggingface_dataset.py +++ b/common/dataset/huggingface_dataset.py @@ -25,7 +25,7 @@ def __call__(self, config): if validation_file is not None: validation_dataset = local_load(validation_file) return datasets.DatasetDict( - {"train": train_dataset, "validation_dataset": validation_dataset} + {"train": train_dataset, "validation": validation_dataset} ) if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0: datasets_dict = train_dataset.train_test_split( diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index f05c6317f..7d76aafa6 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -155,10 +155,10 @@ def train(self): max_train_step = self.config.get("max_train_step") max_eval_step = self.config.get("max_eval_step") for idx in range(self.starting_epoch, num_train_epochs, 1): - logger.info(f"start train epoch {idx}") self.model.train() start = time.time() total_steps = len(self.train_dataloader) + logger.info(f"start train epoch {idx}, total_steps {total_steps}") for step, batch in enumerate(self.train_dataloader): with self.accelerator.accumulate(self.model): outputs = self.model(**batch) @@ -172,8 +172,9 @@ def train(self): if step % logging_steps == 0: loss = loss.item() ppl = math.exp(loss) + epochs = (step + idx * total_steps) / (num_train_epochs * total_steps) logger.info( - f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" + f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}" ) report( { diff --git a/finetune/finetune.py b/finetune/finetune.py index 7ab0183db..e79cd3293 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -2,7 +2,7 @@ import os import argparse -from typing import Any, Dict, Union +from typing import Any, Dict, Union, Optional import torch import accelerate @@ -63,12 +63,14 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any], return mode_env_vars[mode] -def convert_dtype(dtype: str) -> torch.dtype: - supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32} - if dtype in supported_dtypes: - return supported_dtypes[dtype] - else: - raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]") +def convert_dtype(dtype: str) -> Optional[torch.dtype]: + supported_dtypes = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + "no": None, + } + return supported_dtypes[dtype] def train_func(config: Dict[str, Any]): diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml index 285520d82..ed58b4deb 100644 --- a/finetune/finetune.yaml +++ b/finetune/finetune.yaml @@ -30,5 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP - gradient_accumulation_steps: 2 + gradient_accumulation_steps: 1 logging_steps: 10 diff --git a/finetune/finetune_config.py 
b/finetune/finetune_config.py index 6a269b7ee..e15e6051c 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -56,7 +56,7 @@ class Training(BaseModel): resources_per_worker: RayResourceConfig accelerate_mode: str mixed_precision: str = "no" - gradient_accumulation_steps: int + gradient_accumulation_steps: int = 1 logging_steps: int = 10 @validator("device") @@ -73,6 +73,13 @@ def check_accelerate_mode(cls, v: str): raise ValueError(f"accelerate_mode must be one of {modes}") return v + @validator("mixed_precision") + def check_mixed_precision(cls, v: str): + supported_precisions = ["no", "fp16", "bf16", "fp32"] + if v not in supported_precisions: + raise ValueError(f"mixed_precision must be on of {supported_precisions}") + return v + @validator("logging_steps") def check_logging_steps(cls, v: int): assert v > 0 From f215f4ad4a59d164ef27f2813a22246e8697fff5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 18:24:28 +0000 Subject: [PATCH 02/10] update --- .github/workflows/workflow_finetune_gpu.yml | 8 ++++---- examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml | 5 +++++ finetune/models/gpt-j-6b.yaml | 4 ++++ finetune/models/gpt2.yaml | 4 ++++ finetune/models/llama-2-7b-chat-hf.yaml | 11 +++++++++-- finetune/models/mistral-7b-v0.1.yaml | 8 ++++++-- finetune/models/mpt-7b-chat.yaml | 4 ++++ 7 files changed, 36 insertions(+), 8 deletions(-) diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 561522c27..416cbd458 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -8,17 +8,17 @@ on: default: '10.1.2.13:5000/llmray-build' http_proxy: type: string - default: 'http://proxy-chain.intel.com:911' + default: 'http://10.24.221.149:911' https_proxy: type: string - default: 'http://proxy-chain.intel.com:911' + default: 'http://10.24.221.149:911' jobs: finetune: name: finetune on gpu test strategy: matrix: - model: [ pythia-6.9b, gpt-j-6b ] + model: [ meta-llama/Llama-2-7b-chat-hf, mistralai/Mistral-7B-v0.1 ] runs-on: self-hosted defaults: @@ -43,4 +43,4 @@ jobs: cd ~/borealis-runner/ python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" - name: Test Summary - run: echo "to be continued" \ No newline at end of file + run: echo "to be continued" diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml index 41303f615..78d126b55 100644 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data.jsonl validation_file: null @@ -22,9 +24,12 @@ Training: learning_rate: 1.0e-05 lr_scheduler: linear weight_decay: 0.0 + mixed_precision: bf16 device: GPU num_training_workers: 2 accelerate_mode: GPU_DDP resources_per_worker: CPU: 1 GPU: 1 + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml index f0092022d..ed58b4deb 100644 --- a/finetune/models/gpt-j-6b.yaml +++ b/finetune/models/gpt-j-6b.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: 
/tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml index 0f9dbf9a8..e484a05e2 100644 --- a/finetune/models/gpt2.yaml +++ b/finetune/models/gpt2.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 56348b2d1..2b45ad6f7 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -1,8 +1,9 @@ General: base_model: meta-llama/Llama-2-7b-chat-hf gpt_base_model: false - output_dir: /tmp/llm-ray/output - checkpoint_dir: /tmp/llm-ray/checkpoint + output_dir: ./output + checkpoint_dir: ./checkpoint + tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null @@ -11,6 +12,10 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + target_modules: + - q_proj + - v_proj + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +33,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 8e3eec5ce..7196aec82 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -1,8 +1,9 @@ General: base_model: mistralai/Mistral-7B-v0.1 gpt_base_model: false - output_dir: /tmp/llm-ray/output - checkpoint_dir: /tmp/llm-ray/checkpoint + output_dir: ./output + checkpoint_dir: ./checkpoint + tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null @@ -20,6 +21,7 @@ General: - up_proj - down_proj - lm_head + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -37,3 +39,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml index e8f04d209..8218b9ad6 100644 --- a/finetune/models/mpt-7b-chat.yaml +++ b/finetune/models/mpt-7b-chat.yaml @@ -3,6 +3,7 @@ General: gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint + tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: true use_auth_token: null @@ -11,6 +12,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +30,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + 
gradient_accumulation_steps: 1 + logging_steps: 10 From c46650d6b477ca35c0703e3d5c08822ac35f68e5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 18:27:25 +0000 Subject: [PATCH 03/10] update --- common/trainer/default_trainer.py | 2 +- finetune/finetune_config.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index 7d76aafa6..4aa13b6eb 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -158,7 +158,7 @@ def train(self): self.model.train() start = time.time() total_steps = len(self.train_dataloader) - logger.info(f"start train epoch {idx}, total_steps {total_steps}") + logger.info(f"Start training epoch {idx}, total_steps {total_steps}") for step, batch in enumerate(self.train_dataloader): with self.accelerator.accumulate(self.model): outputs = self.model(**batch) diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index e15e6051c..779a488c7 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -77,7 +77,7 @@ def check_accelerate_mode(cls, v: str): def check_mixed_precision(cls, v: str): supported_precisions = ["no", "fp16", "bf16", "fp32"] if v not in supported_precisions: - raise ValueError(f"mixed_precision must be on of {supported_precisions}") + raise ValueError(f"mixed_precision must be one of {supported_precisions}") return v @validator("logging_steps") From 02535dacc25b1f95bb8a3bd2bdae73b75ddcf128 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 5 Feb 2024 18:43:03 +0000 Subject: [PATCH 04/10] update --- docs/finetune_parameters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index a10bb8c33..6c9a5dbd3 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -34,7 +34,7 @@ The following are the parameters supported in the finetuning workflow. |learning_rate|1e-5|Initial learning rate to use.| |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"| |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.| -|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set. +|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp32". Default is "no" if not set. |device|CPU|The device type used, can be "CPU", "GPU".| |num_training_workers|2|The number of the training process.| |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. 
If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.| From c494983709660234962647f5eb463e36e1160ca3 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 6 Feb 2024 13:16:16 +0000 Subject: [PATCH 05/10] remove accelrator tracking, replace by ray train.report --- common/trainer/default_trainer.py | 13 ++----------- docs/finetune_parameters.md | 1 - examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml | 1 - finetune/finetune.py | 10 ---------- finetune/finetune.yaml | 1 - finetune/finetune_config.py | 1 - finetune/models/gpt-j-6b.yaml | 1 - finetune/models/gpt2.yaml | 1 - finetune/models/llama-2-7b-chat-hf.yaml | 1 - finetune/models/mistral-7b-v0.1.yaml | 1 - finetune/models/mpt-7b-chat.yaml | 1 - 11 files changed, 2 insertions(+), 30 deletions(-) diff --git a/common/trainer/default_trainer.py b/common/trainer/default_trainer.py index 4aa13b6eb..5ed24f677 100644 --- a/common/trainer/default_trainer.py +++ b/common/trainer/default_trainer.py @@ -178,8 +178,8 @@ def train(self): ) report( { - "loss": loss, - "ppl": ppl, + "train_loss": loss, + "train_ppl": ppl, "train_epoch": idx, "total_epochs": num_train_epochs, "train_step": step, @@ -188,10 +188,6 @@ def train(self): else total_steps, } ) - self.accelerator.log( - {"train loss": loss, "train perplexity": ppl}, - step=idx * total_steps + step, - ) start = time.time() if max_train_step is not None: if step >= max_train_step - 1: @@ -222,9 +218,6 @@ def train(self): except OverflowError: eval_loss = float("inf") perplexity = float("inf") - self.accelerator.log( - {"evaluate loss": eval_loss, "evaluate perplexity": perplexity} - ) logger.info( f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]" ) @@ -244,8 +237,6 @@ def train(self): ) logger.info(f"finish save model to {output}") - self.accelerator.end_training() - self.accelerator.wait_for_everyone() def _get_local_path(self, root_path, model_name): diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 6c9a5dbd3..00168767b 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow. |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is realted to gpt, otherwise it is False.| |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model| |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint| -|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers| |config|trust_remote_code: False
use_auth_token: None|Will be passed to the transformers `from_pretrained()` method| |lora_config|task_type: CAUSAL_LM
r: 8
lora_alpha: 32
lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.| |deltatuner_config|"algo": "lora"
"denas": True
"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.| diff --git a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml index 78d126b55..3cfa96913 100644 --- a/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml +++ b/examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/finetune.py b/finetune/finetune.py index e79cd3293..98586aecc 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -91,24 +91,14 @@ def train_func(config: Dict[str, Any]): else: fsdp_plugin = None - log_with = "tensorboard" # only support tensorboard as tracker output_dir = config["General"]["output_dir"] - tracking_dir = config["General"]["tracking_dir"] accelerator = accelerate.Accelerator( gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin, - log_with=log_with, - project_dir=tracking_dir, ) epochs = config["Training"]["epochs"] - tracker_config = { - "epochs": epochs, - "learning_rate": config["Training"]["learning_rate"], - "batch_size": config["Training"]["batch_size"], - } base_model = config["General"]["base_model"] dataset_file = config["Dataset"]["train_file"] - accelerator.init_trackers("fine-tuning", config=tracker_config) common.logger.info( f"accelerator generate finish, accelerator device type = {accelerator.device}" diff --git a/finetune/finetune.yaml b/finetune/finetune.yaml index ed58b4deb..4f27b2ebd 100644 --- a/finetune/finetune.yaml +++ b/finetune/finetune.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index 779a488c7..819e8b1d3 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -26,7 +26,6 @@ class General(BaseModel): gpt_base_model: bool output_dir: str checkpoint_dir: str - tracking_dir: str config: GeneralConfig lora_config: Optional[LoraConfig] = None deltatuner_config: Optional[DeltatunerConfig] = None diff --git a/finetune/models/gpt-j-6b.yaml b/finetune/models/gpt-j-6b.yaml index ed58b4deb..4f27b2ebd 100644 --- a/finetune/models/gpt-j-6b.yaml +++ b/finetune/models/gpt-j-6b.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/gpt2.yaml b/finetune/models/gpt2.yaml index e484a05e2..03048ee29 100644 --- a/finetune/models/gpt2.yaml +++ b/finetune/models/gpt2.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: true output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 2b45ad6f7..014c06b0b 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: false 
output_dir: ./output checkpoint_dir: ./checkpoint - tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 7196aec82..78f355b37 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: false output_dir: ./output checkpoint_dir: ./checkpoint - tracking_dir: ./tracking config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/mpt-7b-chat.yaml b/finetune/models/mpt-7b-chat.yaml index 8218b9ad6..e84c4116a 100644 --- a/finetune/models/mpt-7b-chat.yaml +++ b/finetune/models/mpt-7b-chat.yaml @@ -3,7 +3,6 @@ General: gpt_base_model: false output_dir: /tmp/llm-ray/output checkpoint_dir: /tmp/llm-ray/checkpoint - tracking_dir: /tmp/llm-ray/tracking config: trust_remote_code: true use_auth_token: null From ae3efbe50e913c0690ad0e7dbd7d42859a8549ce Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 6 Feb 2024 15:46:23 +0000 Subject: [PATCH 06/10] update --- finetune/finetune.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/finetune/finetune.py b/finetune/finetune.py index 98586aecc..c64eeeb4f 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -126,9 +126,11 @@ def train_func(config: Dict[str, Any]): model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()( config={ "name": base_model, - "dtype": convert_dtype(config["Training"]["mixed_precision"]), + "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")), "config": config["General"]["config"], - "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"], + "enable_gradient_checkpointing": config["General"].get( + "enable_gradient_checkpointing", "no" + ), "lora_config": config["General"]["lora_config"] if config["General"].get("lora_config") else None, From 59c757e9817266124bf6747c52be87a29d385b20 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Mon, 19 Feb 2024 16:04:48 +0000 Subject: [PATCH 07/10] update --- docs/finetune_parameters.md | 2 +- finetune/finetune.py | 1 - finetune/finetune_config.py | 2 +- finetune/models/bloom-560m.yaml | 3 +++ finetune/models/finetune_config_template.yaml | 3 +++ finetune/models/llama-2-7b-chat-hf.yaml | 4 ++-- finetune/models/mistral-7b-v0.1.yaml | 4 ++-- 7 files changed, 12 insertions(+), 7 deletions(-) diff --git a/docs/finetune_parameters.md b/docs/finetune_parameters.md index 00168767b..69a906e86 100644 --- a/docs/finetune_parameters.md +++ b/docs/finetune_parameters.md @@ -33,7 +33,7 @@ The following are the parameters supported in the finetuning workflow. |learning_rate|1e-5|Initial learning rate to use.| |lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"| |weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.| -|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp32". Default is "no" if not set. +|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16". Default is "no" if not set. 
|device|CPU|The device type used, can be "CPU", "GPU".| |num_training_workers|2|The number of the training process.| |resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.| diff --git a/finetune/finetune.py b/finetune/finetune.py index c64eeeb4f..577695c8c 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -67,7 +67,6 @@ def convert_dtype(dtype: str) -> Optional[torch.dtype]: supported_dtypes = { "fp16": torch.float16, "bf16": torch.bfloat16, - "fp32": torch.float32, "no": None, } return supported_dtypes[dtype] diff --git a/finetune/finetune_config.py b/finetune/finetune_config.py index 819e8b1d3..c24114394 100644 --- a/finetune/finetune_config.py +++ b/finetune/finetune_config.py @@ -74,7 +74,7 @@ def check_accelerate_mode(cls, v: str): @validator("mixed_precision") def check_mixed_precision(cls, v: str): - supported_precisions = ["no", "fp16", "bf16", "fp32"] + supported_precisions = ["no", "fp16", "bf16"] if v not in supported_precisions: raise ValueError(f"mixed_precision must be one of {supported_precisions}") return v diff --git a/finetune/models/bloom-560m.yaml b/finetune/models/bloom-560m.yaml index c2999ce7a..e96ee51fb 100644 --- a/finetune/models/bloom-560m.yaml +++ b/finetune/models/bloom-560m.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/finetune_config_template.yaml b/finetune/models/finetune_config_template.yaml index f0092022d..4f27b2ebd 100644 --- a/finetune/models/finetune_config_template.yaml +++ b/finetune/models/finetune_config_template.yaml @@ -11,6 +11,7 @@ General: r: 8 lora_alpha: 32 lora_dropout: 0.1 + enable_gradient_checkpointing: false Dataset: train_file: examples/data/sample_finetune_data_small.jsonl validation_file: null @@ -28,3 +29,5 @@ Training: resources_per_worker: CPU: 32 accelerate_mode: CPU_DDP + gradient_accumulation_steps: 1 + logging_steps: 10 diff --git a/finetune/models/llama-2-7b-chat-hf.yaml b/finetune/models/llama-2-7b-chat-hf.yaml index 014c06b0b..d9a304ee4 100644 --- a/finetune/models/llama-2-7b-chat-hf.yaml +++ b/finetune/models/llama-2-7b-chat-hf.yaml @@ -1,8 +1,8 @@ General: base_model: meta-llama/Llama-2-7b-chat-hf gpt_base_model: false - output_dir: ./output - checkpoint_dir: ./checkpoint + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint config: trust_remote_code: false use_auth_token: null diff --git a/finetune/models/mistral-7b-v0.1.yaml b/finetune/models/mistral-7b-v0.1.yaml index 78f355b37..b7003c31b 100644 --- a/finetune/models/mistral-7b-v0.1.yaml +++ b/finetune/models/mistral-7b-v0.1.yaml @@ -1,8 +1,8 @@ General: base_model: mistralai/Mistral-7B-v0.1 gpt_base_model: false - output_dir: ./output - checkpoint_dir: ./checkpoint + output_dir: /tmp/llm-ray/output + checkpoint_dir: /tmp/llm-ray/checkpoint config: trust_remote_code: false use_auth_token: null From eba891ee7a4f94bd283641e0d81b5cc5e0707622 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 20 Feb 2024 09:10:08 +0000 Subject: [PATCH 08/10] update --- .../config/update_finetune_config_on_intel_gpu.py | 12 ++++++------ .github/workflows/workflow_finetune_gpu.yml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index e46dda811..eb19ad35b 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -2,9 +2,8 @@ import argparse -def update_finetune_config(base_model): - conf_file = "finetune/finetune.yaml" - with open(conf_file) as f: +def update_finetune_config(config_file, base_model): + with open(config_file) as f: config = yaml.load(f, Loader=yaml.FullLoader) # due to compute node can't connect network # base models are downloaded as local files in directory ~/models/ @@ -23,18 +22,19 @@ def update_finetune_config(base_model): # pythia-6.9b config["General"]["base_model"] = base_model - # config["General"]["base_model"] = "pythia-70m" config["Training"]["device"] = "GPU" config["Training"]["resources_per_worker"]["CPU"] = 1 config["Training"]["resources_per_worker"]["GPU"] = 1 config["Training"]["accelerate_mode"] = "GPU_DDP" + config["Training"]["logging_steps"] = 1 - with open(conf_file, "w") as f: + with open(config_file, "w") as f: yaml.dump(config, f, sort_keys=False) def get_parser(): parser = argparse.ArgumentParser(description="Finetuning on Intel GPU") + parser.add_argument("--config_file", type=str, required=True, default=None) parser.add_argument("--base_model", type=str, required=True, default=None) return parser @@ -43,4 +43,4 @@ def get_parser(): parser = get_parser() args = parser.parse_args() - update_finetune_config(args.base_model) + update_finetune_config(args.config_file, args.base_model) diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 416cbd458..57a32bc06 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -18,7 +18,7 @@ jobs: name: finetune on gpu test strategy: matrix: - model: [ meta-llama/Llama-2-7b-chat-hf, mistralai/Mistral-7B-v0.1 ] + model: [ meta-llama/Llama-2-7b-chat-hf ] runs-on: self-hosted defaults: From 884d1f31fc1daba731a95c4ab763d875f6d778b5 Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Tue, 20 Feb 2024 16:37:41 +0000 Subject: [PATCH 09/10] update --- .github/workflows/config/update_finetune_config_on_intel_gpu.py | 2 ++ .github/workflows/workflow_finetune_gpu.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/config/update_finetune_config_on_intel_gpu.py b/.github/workflows/config/update_finetune_config_on_intel_gpu.py index eb19ad35b..38d49143b 100644 --- a/.github/workflows/config/update_finetune_config_on_intel_gpu.py +++ b/.github/workflows/config/update_finetune_config_on_intel_gpu.py @@ -22,6 +22,8 @@ def update_finetune_config(config_file, base_model): # pythia-6.9b config["General"]["base_model"] = base_model + config["General"]["output_dir"] = "./output" + config["General"]["checkpoint_dir"] = "./checkpoint" config["Training"]["device"] = "GPU" config["Training"]["resources_per_worker"]["CPU"] = 1 config["Training"]["resources_per_worker"]["GPU"] = 1 diff --git a/.github/workflows/workflow_finetune_gpu.yml b/.github/workflows/workflow_finetune_gpu.yml index 57a32bc06..4dcb43da3 100644 --- a/.github/workflows/workflow_finetune_gpu.yml +++ b/.github/workflows/workflow_finetune_gpu.yml @@ -41,6 +41,6 @@ jobs: rm ~/borealis-runner/llm-on-ray.tar.gz -f tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray . 
cd ~/borealis-runner/ - python3 finetune_on_pvc.py --base_model "${{ matrix.model }}" + python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}" - name: Test Summary run: echo "to be continued" From 685ddbc8dd6b37ce99970c08e1116a4361c5542f Mon Sep 17 00:00:00 2001 From: "Wu, Gangsheng" Date: Wed, 21 Feb 2024 13:44:24 +0000 Subject: [PATCH 10/10] update --- finetune/finetune.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/finetune/finetune.py b/finetune/finetune.py index 577695c8c..f6d2f15c6 100644 --- a/finetune/finetune.py +++ b/finetune/finetune.py @@ -128,7 +128,7 @@ def train_func(config: Dict[str, Any]): "dtype": convert_dtype(config["Training"].get("mixed_precision", "no")), "config": config["General"]["config"], "enable_gradient_checkpointing": config["General"].get( - "enable_gradient_checkpointing", "no" + "enable_gradient_checkpointing", False ), "lora_config": config["General"]["lora_config"] if config["General"].get("lora_config")
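
For reference, the mixed-precision handling that patches 01, 03, and 07 converge on can be collapsed into a short standalone sketch. The function and validator bodies below are copied from the diffs above; the imports, the minimal Training model, and the usage at the bottom are filled in here as assumptions so the snippet runs on its own (pydantic v1-style @validator, as finetune_config.py already uses).

from typing import Optional

import torch
from pydantic import BaseModel, validator


def convert_dtype(dtype: str) -> Optional[torch.dtype]:
    # "no" maps to None, i.e. no explicit torch dtype is forced on the model.
    supported_dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "no": None,
    }
    return supported_dtypes[dtype]


class Training(BaseModel):
    # Minimal stand-in (assumption) for the full Training schema in finetune_config.py.
    mixed_precision: str = "no"
    gradient_accumulation_steps: int = 1
    logging_steps: int = 10

    @validator("mixed_precision")
    def check_mixed_precision(cls, v: str):
        supported_precisions = ["no", "fp16", "bf16"]
        if v not in supported_precisions:
            raise ValueError(f"mixed_precision must be one of {supported_precisions}")
        return v


cfg = Training(mixed_precision="bf16")
print(convert_dtype(cfg.mixed_precision))  # -> torch.bfloat16

Similarly, the progress value logged in default_trainer.py by patch 01, (step + idx * total_steps) / (num_train_epochs * total_steps), runs from 0.0 at the first step to just under 1.0 at the last step of the final epoch, so the single "train epoch:" number reported to Ray replaces the earlier epoch/step pair.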