PR #99: Revert "Revert "some changes to support fine-tuning on Intel GPU (#88…"
Changes from all commits
File 1 (trainer):

```diff
@@ -85,8 +85,6 @@ def _get_lr_scheduler(
         num_steps_per_epoch,
         accelerator,
     ):
-        # gradient_accumulation_steps = accelerator.gradient_accumulation_steps
-        # num_update_steps_per_epoch = math.ceil(num_steps_per_epoch / gradient_accumulation_steps)
         enable = lr_scheduler_config.get("enable", False)
         if not enable:
             return None
@@ -153,7 +151,7 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator):
     def train(self):
         num_train_epochs = self.config.get("num_train_epochs", 1)
         checkpoint = self.config.get("checkpoint")
-        log_step = self.config.get("log_step", 1)
+        logging_steps = self.config.get("logging_steps", 1)
         max_train_step = self.config.get("max_train_step")
         max_eval_step = self.config.get("max_eval_step")
         for idx in range(self.starting_epoch, num_train_epochs, 1):
@@ -170,12 +168,17 @@ def train(self):
                     if self.lr_scheduler is not None:
                         self.lr_scheduler.step()
                     self.optimizer.zero_grad()
-                if step % log_step == 0:
+                if step % logging_steps == 0:
                     loss = loss.item()
+                    ppl = math.exp(loss)
                     logger.info(
-                        f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}"
+                        f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
                     )
                     report(
                         {
                             "loss": loss,
                             "ppl": ppl,
                             "train_epoch": idx,
                             "total_epochs": num_train_epochs,
                             "train_step": step,
@@ -184,6 +187,10 @@ def train(self):
                             else total_steps,
                         }
                     )
+                    self.accelerator.log(
+                        {"train loss": loss, "train perplexity": ppl},
+                        step=idx * total_steps + step,
+                    )
                     start = time.time()
                 if max_train_step is not None:
                     if step >= max_train_step - 1:
```

Contributor (on the new `accelerator.log` call): Do we want to use Ray's `report` or `accelerator.log` to log the metrics? Currently the code above logs the metrics twice, right? If Ray's `report` already meets our requirements, I think we don't need to use `accelerator.log` to log again.
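A minimal sketch of the single-sink option this comment raises: keep only Ray's `report` and drop `accelerator.log`. The helper name is hypothetical, and `report` stands for whatever callable the trainer already imports (its origin is not shown in this diff):

```python
import math
import time

def log_train_metrics(report, logger, loss_tensor, idx, num_train_epochs,
                      step, total_steps, start):
    # Compute the scalar loss and perplexity once, then emit them through a
    # single sink (Ray's report) instead of both report() and accelerator.log().
    loss = loss_tensor.item()
    ppl = math.exp(loss)
    logger.info(
        f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]"
        f"\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time() - start:.6f}"
    )
    report({"loss": loss, "ppl": ppl, "train_epoch": idx, "train_step": step})
```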
```diff
@@ -214,6 +221,9 @@ def train(self):
                 except OverflowError:
                     eval_loss = float("inf")
                     perplexity = float("inf")
+                self.accelerator.log(
+                    {"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
+                )
                 logger.info(
                     f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
                 )
```
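For context on the `OverflowError` branch above: `math.exp` overflows a Python float once the loss passes roughly 709, so extreme eval losses must be mapped to `inf` explicitly. A quick standalone check:

```python
import math

print(math.exp(709))     # ~8.2e307, still within double range
try:
    math.exp(710)        # exceeds the largest double (~1.8e308)
except OverflowError:
    print(float("inf"))  # what the trainer reports instead
```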
```diff
@@ -232,6 +242,9 @@ def train(self):
                     save_function=self.accelerator.save,
                 )
                 logger.info(f"finish save model to {output}")

+        self.accelerator.end_training()
+
         self.accelerator.wait_for_everyone()

     def _get_local_path(self, root_path, model_name):
```
File 2 (finetuning parameters documentation):

```diff
@@ -10,9 +10,11 @@ The following are the parameters supported in the finetuning workflow.
 |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is related to gpt, otherwise it is False.|
 |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
 |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
+|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
 |config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
 |lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
 |deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
+|enable_gradient_checkpointing|False|enable gradient checkpointing to save GPU memory, but will cost more compute runtime|

 ## Dataset Parameters
```

Contributor (on the `tracking_dir` row): Can we directly use `output_dir` + "tracking" as the directory and not add this new parameter?
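A sketch of what the reviewer proposes: derive the tracker directory from the existing `output_dir` instead of adding a `tracking_dir` parameter. The `"tracking"` subdirectory name is the reviewer's suggestion, not merged behavior:

```python
import os

output_dir = "/tmp/llm-ray/output"                   # existing parameter
tracking_dir = os.path.join(output_dir, "tracking")  # derived, no new config key
# accelerator = accelerate.Accelerator(..., log_with="tensorboard",
#                                      project_dir=tracking_dir)
```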
```diff
@@ -40,3 +42,4 @@ The following are the parameters supported in the finetuning workflow.
 |max_train_steps|None|Total number of training steps to perform. If provided, overrides epochs.|
 |gradient_accumulation_steps|1|Number of update steps to accumulate before performing a backward/update pass.|
 |seed|None|A seed for reproducible training.|
+|logging_steps|10|logging per steps|
```
File 3 (finetune entry script):

```diff
@@ -4,6 +4,7 @@
 import argparse
 from typing import Any, Dict, Union

 import torch
 import accelerate
+from accelerate.utils import is_xpu_available
@@ -62,6 +63,14 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
     return mode_env_vars[mode]


+def convert_dtype(dtype: str) -> torch.dtype:
+    supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
+    if dtype in supported_dtypes:
+        return supported_dtypes[dtype]
+    else:
+        raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")
+
+
 def train_func(config: Dict[str, Any]):
     cwd = config.get("cwd")
     if cwd:
```

Contributor (on `convert_dtype`): You passed `mixed_precision` as the parameter; its value could be "no", "fp16", "bf16" or "fp8". But "no" and "fp8" are not properly handled here.

Contributor (Author): updated.

Contributor: Can you add the check in finetune_config.py instead of here?

Contributor (Author): yes, will update in another PR.
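A hedged sketch of one way to address the first comment: handle `"no"` and reject `"fp8"` explicitly. Mapping `"no"` to `torch.float32` is an assumption here, not confirmed project behavior:

```python
import torch

def convert_dtype(dtype: str) -> torch.dtype:
    # "no" means mixed precision is disabled; assume full fp32 in that case.
    supported_dtypes = {
        "no": torch.float32,
        "fp32": torch.float32,
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
    }
    if dtype not in supported_dtypes:
        # "fp8" (and anything else unexpected) lands here with a clear message.
        raise ValueError(
            f"unsupported mixed_precision value {dtype!r}, "
            f"expected one of {list(supported_dtypes)}"
        )
    return supported_dtypes[dtype]
```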
```diff
@@ -79,9 +88,26 @@ def train_func(config: Dict[str, Any]):
         )
     else:
         fsdp_plugin = None

+    log_with = "tensorboard"  # only support tensorboard as tracker
+    output_dir = config["General"]["output_dir"]
+    tracking_dir = config["General"]["tracking_dir"]
     accelerator = accelerate.Accelerator(
-        gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        fsdp_plugin=fsdp_plugin,
+        log_with=log_with,
+        project_dir=tracking_dir,
     )
+    epochs = config["Training"]["epochs"]
+    tracker_config = {
+        "epochs": epochs,
+        "learning_rate": config["Training"]["learning_rate"],
+        "batch_size": config["Training"]["batch_size"],
+    }
+    base_model = config["General"]["base_model"]
+    dataset_file = config["Dataset"]["train_file"]
+    accelerator.init_trackers("fine-tuning", config=tracker_config)

     common.logger.info(
         f"accelerator generate finish, accelerator device type = {accelerator.device}"
     )
```
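The hunk above wires up Accelerate's experiment tracking. A minimal standalone sketch of the lifecycle it relies on, using only documented `accelerate` tracker APIs (the path, hyperparameters, and metric names are illustrative):

```python
import accelerate

accelerator = accelerate.Accelerator(
    log_with="tensorboard",               # tracker backend
    project_dir="/tmp/llm-ray/tracking",  # where event files are written
)
# Registers the run and stores hyperparameters alongside the metrics.
accelerator.init_trackers("fine-tuning", config={"epochs": 3, "learning_rate": 1e-5})
accelerator.log({"train loss": 0.5, "train perplexity": 1.65}, step=10)
accelerator.end_training()                # flushes and closes the tracker
```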
```diff
@@ -92,23 +118,25 @@ def train_func(config: Dict[str, Any]):

     datasets = common.dataset.Dataset.registory.get("HuggingfaceDataset")()(
         config={
-            "name": config["Dataset"]["train_file"],
+            "name": dataset_file,
             "validation_file": config["Dataset"]["validation_file"],
             "validation_split_percentage": config["Dataset"]["validation_split_percentage"],
         }
     )

     tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(
         config={
-            "name": config["General"]["base_model"],
+            "name": base_model,
             "config": config["General"]["config"],
         }
     )

     model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
         config={
-            "name": config["General"]["base_model"],
+            "name": base_model,
             "dtype": convert_dtype(config["Training"]["mixed_precision"]),
             "config": config["General"]["config"],
             "enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
             "lora_config": config["General"]["lora_config"]
             if config["General"].get("lora_config")
             else None,
@@ -125,10 +153,10 @@ def train_func(config: Dict[str, Any]):

     trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(
         config={
-            "num_train_epochs": config["Training"]["epochs"],
+            "num_train_epochs": epochs,
             "max_train_step": config["Training"].get("max_train_steps", None),
-            "log_step": 1,
-            "output": config["General"]["output_dir"],
+            "logging_steps": config["Training"].get("logging_steps", 1),
+            "output": output_dir,
             "dataprocesser": {
                 "type": "GeneralProcesser",
                 "per_device_train_batch_size": config["Training"]["batch_size"],
```
```diff
@@ -217,14 +245,21 @@ def main(external_config=None):
             "FI_PROVIDER": "tcp",
         }
     }

     accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode, config)
     runtime_env["env_vars"].update(accelerate_env_vars)

     if config["General"]["gpt_base_model"] is True:
         runtime_env["pip"] = ["transformers==4.26.0"]

-    ray.init(runtime_env=runtime_env)
+    import intel_extension_for_pytorch as ipex
+
+    if "xpu" in ipex.__version__:
+        num_cpus = (
+            resources_per_worker["CPU"] * num_training_workers + 1
+        )  # additional 1 for head worker
+        ray.init(num_cpus=num_cpus, runtime_env=runtime_env)
+    else:
+        ray.init(runtime_env=runtime_env)

     common.logger.info(f"ray available resources = {ray.available_resources()}")
```

Contributor (on the `ray.init` change): Why do we need this change, and can we avoid it? If we start Ray first and then execute the finetune command, do we still need this change, and does it still work?

Contributor (Author): This change is a workaround solution.
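A sketch of the alternative the reviewer hints at: start the Ray cluster ahead of time (e.g. `ray start --head --num-cpus=33`) and attach to it, so the script never has to size CPUs itself. `ray.init(address="auto")` is standard Ray API; whether this also avoids the XPU issue is exactly the open question in the thread:

```python
import ray

runtime_env = {"env_vars": {"FI_PROVIDER": "tcp"}}  # as in the diff above
# Attach to the already-running cluster instead of creating a local one,
# keeping resource sizing a deployment decision rather than script logic.
ray.init(address="auto", runtime_env=runtime_env)
```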
File 4 (example finetuning YAML config):

```diff
@@ -3,6 +3,7 @@ General:
   gpt_base_model: true
   output_dir: /tmp/llm-ray/output
   checkpoint_dir: /tmp/llm-ray/checkpoint
+  tracking_dir: /tmp/llm-ray/tracking
   config:
     trust_remote_code: false
     use_auth_token: null
@@ -11,6 +12,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null
@@ -28,3 +30,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 2
+  logging_steps: 10
```

Contributor (on `gradient_accumulation_steps: 2`): The default value is 1 in our document. Can you please set it to 1 here?

Contributor (Author): yes, will update in another PR.
File 5 (finetune_config.py):

```diff
@@ -26,9 +26,11 @@ class General(BaseModel):
     gpt_base_model: bool
     output_dir: str
     checkpoint_dir: str
+    tracking_dir: str
     config: GeneralConfig
     lora_config: Optional[LoraConfig] = None
     deltatuner_config: Optional[DeltatunerConfig] = None
+    enable_gradient_checkpointing: bool = False


 class Dataset(BaseModel):
```
```diff
@@ -54,6 +56,8 @@ class Training(BaseModel):
     resources_per_worker: RayResourceConfig
     accelerate_mode: str
     mixed_precision: str = "no"
+    gradient_accumulation_steps: int
+    logging_steps: int = 10

     @validator("device")
     def check_device(cls, v: str):
```

Contributor (on `gradient_accumulation_steps`): Can you set the default value to 1 here?

Contributor (Author): yes, will update in another PR.
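A one-line sketch of the requested fix: give `gradient_accumulation_steps` the documented default of 1 so the YAML key becomes optional (field names mirror the `Training` model above; other fields omitted):

```python
from pydantic import BaseModel

class Training(BaseModel):
    gradient_accumulation_steps: int = 1  # matches the documented default
    logging_steps: int = 10
```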
```diff
@@ -69,6 +73,11 @@ def check_accelerate_mode(cls, v: str):
             raise ValueError(f"accelerate_mode must be one of {modes}")
         return v

+    @validator("logging_steps")
+    def check_logging_steps(cls, v: int):
+        assert v > 0
+        return v
+
     # @model_validator(mode='after')
     # def check_device_and_accelerate_mode(self) -> "Training":
     #     dev = self.device
```
Contributor: Instead of just outputting 0, 1, 2, etc., can we support output like 0.1, 0.2, etc., just like other workflows?

Contributor (Author): OK, will update in another PR.