Skip to content
This repository was archived by the owner on Sep 23, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions .github/workflows/config/update_finetune_config_on_intel_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import argparse


def update_finetune_config(base_model):
conf_file = "finetune/finetune.yaml"
with open(conf_file) as f:
def update_finetune_config(config_file, base_model):
with open(config_file) as f:
config = yaml.load(f, Loader=yaml.FullLoader)
# The compute node cannot connect to the network, so
# base models are downloaded in advance as local files in the directory ~/models/
Expand All @@ -23,18 +22,21 @@ def update_finetune_config(base_model):
# pythia-6.9b

config["General"]["base_model"] = base_model
# config["General"]["base_model"] = "pythia-70m"
config["General"]["output_dir"] = "./output"
config["General"]["checkpoint_dir"] = "./checkpoint"
config["Training"]["device"] = "GPU"
config["Training"]["resources_per_worker"]["CPU"] = 1
config["Training"]["resources_per_worker"]["GPU"] = 1
config["Training"]["accelerate_mode"] = "GPU_DDP"
config["Training"]["logging_steps"] = 1

with open(conf_file, "w") as f:
with open(config_file, "w") as f:
yaml.dump(config, f, sort_keys=False)


def get_parser():
    """Build the command-line parser for the Intel GPU finetune config updater.

    Returns:
        argparse.ArgumentParser: a parser that requires ``--config_file`` and
        ``--base_model``, both parsed as strings.
    """
    parser = argparse.ArgumentParser(description="Finetuning on Intel GPU")
    # Both options are mandatory, so no ``default`` is supplied: argparse exits
    # with a usage error when a required option is missing and would never
    # fall back to a default value.
    parser.add_argument(
        "--config_file",
        type=str,
        required=True,
        help="Path of the finetune YAML config file to update in place.",
    )
    parser.add_argument(
        "--base_model",
        type=str,
        required=True,
        help="Value to store under General.base_model in the config file.",
    )
    return parser

Expand All @@ -43,4 +45,4 @@ def get_parser():
parser = get_parser()
args = parser.parse_args()

update_finetune_config(args.base_model)
update_finetune_config(args.config_file, args.base_model)
10 changes: 5 additions & 5 deletions .github/workflows/workflow_finetune_gpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ on:
default: '10.1.2.13:5000/llmray-build'
http_proxy:
type: string
default: 'http://proxy-chain.intel.com:911'
default: 'http://10.24.221.149:911'
https_proxy:
type: string
default: 'http://proxy-chain.intel.com:911'
default: 'http://10.24.221.149:911'

jobs:
finetune:
name: finetune on gpu test
strategy:
matrix:
model: [ pythia-6.9b, gpt-j-6b ]
model: [ meta-llama/Llama-2-7b-chat-hf ]
runs-on: self-hosted

defaults:
Expand All @@ -41,6 +41,6 @@ jobs:
rm ~/borealis-runner/llm-on-ray.tar.gz -f
tar zcf ~/borealis-runner/llm-on-ray.tar.gz -C ~/actions-runner/_work/llm-on-ray .
cd ~/borealis-runner/
python3 finetune_on_pvc.py --base_model "${{ matrix.model }}"
python3 finetune_on_pvc.py --need_create_conda_env true --base_models "${{ matrix.model }}"
- name: Test Summary
run: echo "to be continued"
run: echo "to be continued"
2 changes: 1 addition & 1 deletion common/dataset/huggingface_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def __call__(self, config):
if validation_file is not None:
validation_dataset = local_load(validation_file)
return datasets.DatasetDict(
{"train": train_dataset, "validation_dataset": validation_dataset}
{"train": train_dataset, "validation": validation_dataset}
)
if validation_split_percentage / 100 > 0.0 and validation_split_percentage / 100 < 1.0:
datasets_dict = train_dataset.train_test_split(
Expand Down
18 changes: 5 additions & 13 deletions common/trainer/default_trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,10 +155,10 @@ def train(self):
max_train_step = self.config.get("max_train_step")
max_eval_step = self.config.get("max_eval_step")
for idx in range(self.starting_epoch, num_train_epochs, 1):
logger.info(f"start train epoch {idx}")
self.model.train()
start = time.time()
total_steps = len(self.train_dataloader)
logger.info(f"Start training epoch {idx}, total_steps {total_steps}")
for step, batch in enumerate(self.train_dataloader):
with self.accelerator.accumulate(self.model):
outputs = self.model(**batch)
Expand All @@ -172,13 +172,14 @@ def train(self):
if step % logging_steps == 0:
loss = loss.item()
ppl = math.exp(loss)
epochs = (step + idx * total_steps) / (num_train_epochs * total_steps)
logger.info(
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
f"train epoch:{epochs:.6f}\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
)
report(
{
"loss": loss,
"ppl": ppl,
"train_loss": loss,
"train_ppl": ppl,
"train_epoch": idx,
"total_epochs": num_train_epochs,
"train_step": step,
Expand All @@ -187,10 +188,6 @@ def train(self):
else total_steps,
}
)
self.accelerator.log(
{"train loss": loss, "train perplexity": ppl},
step=idx * total_steps + step,
)
start = time.time()
if max_train_step is not None:
if step >= max_train_step - 1:
Expand Down Expand Up @@ -221,9 +218,6 @@ def train(self):
except OverflowError:
eval_loss = float("inf")
perplexity = float("inf")
self.accelerator.log(
{"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
)
logger.info(
f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
)
Expand All @@ -243,8 +237,6 @@ def train(self):
)
logger.info(f"finish save model to {output}")

self.accelerator.end_training()

self.accelerator.wait_for_everyone()

def _get_local_path(self, root_path, model_name):
Expand Down
3 changes: 1 addition & 2 deletions docs/finetune_parameters.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ The following are the parameters supported in the finetuning workflow.
|gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is related to gpt, otherwise it is False.|
|output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
|checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
|config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
|lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then it'll be used as config to build Peft model object.|
|deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then it'll be used as config to build [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
Expand All @@ -34,7 +33,7 @@ The following are the parameters supported in the finetuning workflow.
|learning_rate|1e-5|Initial learning rate to use.|
|lr_scheduler|linear|The scheduler type to use, supported value: "linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"|
|weight_decay|0.0|Weight decay is a regularization technique that adds an L2 norm of all model weights to the loss function while increasing the probability of improving the model generalization.|
|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16" or "fp8". Default is "no" if not set.
|mixed_precision|no|Whether or not to use mixed precision training. Choose from "no", "fp16", "bf16". Default is "no" if not set.|
|device|CPU|The device type used, can be "CPU", "GPU".|
|num_training_workers|2|The number of the training process.|
|resources_per_worker|{"CPU": 32}|A dict to specify the resources for each worker. If `device` is "GPU", please set it like {"CPU": 32, "GPU": 1}.|
Expand Down
4 changes: 4 additions & 0 deletions examples/finetune/gpt_j_6b/finetune_intel_gpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data.jsonl
validation_file: null
Expand All @@ -22,9 +23,12 @@ Training:
learning_rate: 1.0e-05
lr_scheduler: linear
weight_decay: 0.0
mixed_precision: bf16
device: GPU
num_training_workers: 2
accelerate_mode: GPU_DDP
resources_per_worker:
CPU: 1
GPU: 1
gradient_accumulation_steps: 1
logging_steps: 10
31 changes: 12 additions & 19 deletions finetune/finetune.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import os
import argparse
from typing import Any, Dict, Union
from typing import Any, Dict, Union, Optional

import torch
import accelerate
Expand Down Expand Up @@ -63,12 +63,13 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
return mode_env_vars[mode]


def convert_dtype(dtype: str) -> torch.dtype:
supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
if dtype in supported_dtypes:
return supported_dtypes[dtype]
else:
raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")
def convert_dtype(dtype: str) -> Optional[torch.dtype]:
    """Map a ``mixed_precision`` setting to the torch dtype used for the model.

    Args:
        dtype: one of ``"fp16"``, ``"bf16"`` or ``"no"``.

    Returns:
        ``torch.float16`` for ``"fp16"``, ``torch.bfloat16`` for ``"bf16"``,
        and ``None`` for ``"no"``.

    Raises:
        KeyError: if ``dtype`` is not one of the supported settings. The
            message names the rejected value and lists the accepted ones.
    """
    supported_dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "no": None,
    }
    try:
        return supported_dtypes[dtype]
    except KeyError:
        # Keep the KeyError type raised by the plain dict lookup, but make the
        # failure self-explanatory instead of surfacing only the bad key.
        raise KeyError(
            f"unsupported mixed_precision {dtype!r}; "
            f"expected one of {sorted(supported_dtypes)}"
        ) from None


def train_func(config: Dict[str, Any]):
Expand All @@ -89,24 +90,14 @@ def train_func(config: Dict[str, Any]):
else:
fsdp_plugin = None

log_with = "tensorboard" # only support tensorboard as tracker
output_dir = config["General"]["output_dir"]
tracking_dir = config["General"]["tracking_dir"]
accelerator = accelerate.Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps,
fsdp_plugin=fsdp_plugin,
log_with=log_with,
project_dir=tracking_dir,
)
epochs = config["Training"]["epochs"]
tracker_config = {
"epochs": epochs,
"learning_rate": config["Training"]["learning_rate"],
"batch_size": config["Training"]["batch_size"],
}
base_model = config["General"]["base_model"]
dataset_file = config["Dataset"]["train_file"]
accelerator.init_trackers("fine-tuning", config=tracker_config)

common.logger.info(
f"accelerator generate finish, accelerator device type = {accelerator.device}"
Expand Down Expand Up @@ -134,9 +125,11 @@ def train_func(config: Dict[str, Any]):
model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
config={
"name": base_model,
"dtype": convert_dtype(config["Training"]["mixed_precision"]),
"dtype": convert_dtype(config["Training"].get("mixed_precision", "no")),
"config": config["General"]["config"],
"enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
"enable_gradient_checkpointing": config["General"].get(
"enable_gradient_checkpointing", False
),
"lora_config": config["General"]["lora_config"]
if config["General"].get("lora_config")
else None,
Expand Down
3 changes: 1 addition & 2 deletions finetune/finetune.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ General:
gpt_base_model: true
output_dir: /tmp/llm-ray/output
checkpoint_dir: /tmp/llm-ray/checkpoint
tracking_dir: /tmp/llm-ray/tracking
config:
trust_remote_code: false
use_auth_token: null
Expand All @@ -30,5 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 2
gradient_accumulation_steps: 1
logging_steps: 10
10 changes: 8 additions & 2 deletions finetune/finetune_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ class General(BaseModel):
gpt_base_model: bool
output_dir: str
checkpoint_dir: str
tracking_dir: str
config: GeneralConfig
lora_config: Optional[LoraConfig] = None
deltatuner_config: Optional[DeltatunerConfig] = None
Expand Down Expand Up @@ -56,7 +55,7 @@ class Training(BaseModel):
resources_per_worker: RayResourceConfig
accelerate_mode: str
mixed_precision: str = "no"
gradient_accumulation_steps: int
gradient_accumulation_steps: int = 1
logging_steps: int = 10

@validator("device")
Expand All @@ -73,6 +72,13 @@ def check_accelerate_mode(cls, v: str):
raise ValueError(f"accelerate_mode must be one of {modes}")
return v

@validator("mixed_precision")
def check_mixed_precision(cls, v: str):
supported_precisions = ["no", "fp16", "bf16"]
if v not in supported_precisions:
raise ValueError(f"mixed_precision must be one of {supported_precisions}")
return v

@validator("logging_steps")
def check_logging_steps(cls, v: int):
assert v > 0
Expand Down
3 changes: 3 additions & 0 deletions finetune/models/bloom-560m.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/finetune_config_template.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/gpt-j-6b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/gpt2.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
6 changes: 6 additions & 0 deletions finetune/models/llama-2-7b-chat-hf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
target_modules:
- q_proj
- v_proj
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +32,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/mistral-7b-v0.1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ General:
- up_proj
- down_proj
- lm_head
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -37,3 +38,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10
3 changes: 3 additions & 0 deletions finetune/models/mpt-7b-chat.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
Expand All @@ -28,3 +29,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 1
logging_steps: 10