4 changes: 2 additions & 2 deletions .github/workflows/workflow_finetune.yml
@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://proxy-chain.intel.com:911'
+        default: 'http://10.24.221.149:911'
      https_proxy:
        type: string
-        default: 'http://proxy-chain.intel.com:911'
+        default: 'http://10.24.221.149:911'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
4 changes: 2 additions & 2 deletions .github/workflows/workflow_inference.yml
@@ -11,10 +11,10 @@ on:
         default: '10.1.2.13:5000/llmray-build'
       http_proxy:
         type: string
-        default: 'http://proxy-chain.intel.com:911'
+        default: 'http://10.24.221.149:911'
      https_proxy:
        type: string
-        default: 'http://proxy-chain.intel.com:911'
+        default: 'http://10.24.221.149:911'
      runner_config_path:
        type: string
        default: '/home/ci/llm-ray-actions-runner'
13 changes: 12 additions & 1 deletion common/model/huggingface_model_for_causal_lm.py
@@ -8,13 +8,24 @@
 class HuggingFaceModelForCausalLM(Model):
     def __call__(self, config):
         name = config.get("name")
+        model_dtype = config.get("dtype")
         model_config = config.get("config", {})
-        model = transformers.AutoModelForCausalLM.from_pretrained(name, **model_config)
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            name, torch_dtype=model_dtype, **model_config
+        )

         lora_config = config.get("lora_config", None)
         if lora_config:
             peft_config = LoraConfig(**lora_config)
             model = get_peft_model(model, peft_config)
         deltatuner_config = config.get("deltatuner_config", None)
         if deltatuner_config:
             model = deltatuner.optimize(model, **deltatuner_config)

+        enable_gradient_checkpointing = config.get("enable_gradient_checkpointing")
+        if enable_gradient_checkpointing:
+            model.enable_input_require_grads()
+            model.gradient_checkpointing_enable()
+            model.config.use_cache = False

         return model
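For readers unfamiliar with the two new knobs, the following standalone sketch mirrors what the factory above does with the dtype and enable_gradient_checkpointing keys. The model name and config values are hypothetical; only the keys and the three gradient-checkpointing calls come from the diff.

import torch
import transformers

# Config dict mirroring the keys read by HuggingFaceModelForCausalLM.__call__
config = {
    "name": "gpt2",  # hypothetical model, for illustration only
    "dtype": torch.bfloat16,  # what convert_dtype("bf16") would return
    "config": {"trust_remote_code": False},
    "enable_gradient_checkpointing": True,
}

model = transformers.AutoModelForCausalLM.from_pretrained(
    config["name"], torch_dtype=config["dtype"], **config["config"]
)

if config.get("enable_gradient_checkpointing"):
    # Ensure inputs require grads so checkpointing still produces gradients
    # when embedding weights are frozen (as with LoRA/deltatuner adapters).
    model.enable_input_require_grads()
    model.gradient_checkpointing_enable()
    # The KV cache is unused during training and incompatible with checkpointing.
    model.config.use_cache = False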
23 changes: 18 additions & 5 deletions common/trainer/default_trainer.py
@@ -85,8 +85,6 @@ def _get_lr_scheduler(
         num_steps_per_epoch,
         accelerator,
     ):
-        # gradient_accumulation_steps = accelerator.gradient_accumulation_steps
-        # num_update_steps_per_epoch = math.ceil(num_steps_per_epoch / gradient_accumulation_steps)
         enable = lr_scheduler_config.get("enable", False)
         if not enable:
             return None

@@ -153,7 +151,7 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator):
     def train(self):
         num_train_epochs = self.config.get("num_train_epochs", 1)
         checkpoint = self.config.get("checkpoint")
-        log_step = self.config.get("log_step", 1)
+        logging_steps = self.config.get("logging_steps", 1)
         max_train_step = self.config.get("max_train_step")
         max_eval_step = self.config.get("max_eval_step")
         for idx in range(self.starting_epoch, num_train_epochs, 1):

@@ -170,12 +168,17 @@ def train(self):
                 if self.lr_scheduler is not None:
                     self.lr_scheduler.step()
                 self.optimizer.zero_grad()
-                if step % log_step == 0:
+
+                if step % logging_steps == 0:
                     loss = loss.item()
+                    ppl = math.exp(loss)
                     logger.info(
-                        f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}"
+                        f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
                     )
                     report(
                         {
+                            "loss": loss,
+                            "ppl": ppl,
                             "train_epoch": idx,
                             "total_epochs": num_train_epochs,
                             "train_step": step,

@@ -184,6 +187,10 @@ def train(self):
                             else total_steps,
                         }
                     )
+                    self.accelerator.log(
+                        {"train loss": loss, "train perplexity": ppl},
+                        step=idx * total_steps + step,
+                    )
                     start = time.time()
                 if max_train_step is not None:
                     if step >= max_train_step - 1:

@@ -214,6 +221,9 @@ def train(self):
             except OverflowError:
                 eval_loss = float("inf")
                 perplexity = float("inf")
+            self.accelerator.log(
+                {"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
+            )
             logger.info(
                 f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
             )

@@ -232,6 +242,9 @@ def train(self):
                         save_function=self.accelerator.save,
                     )
                 logger.info(f"finish save model to {output}")
+
+        self.accelerator.end_training()
+
         self.accelerator.wait_for_everyone()

     def _get_local_path(self, root_path, model_name):
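The metric calls above go through Accelerate's tracker API, which this PR wires up with TensorBoard. A minimal, self-contained sketch of that flow is below; the tracker name, directory, and metric values are placeholders rather than values from this repository.

import math
import accelerate

# log_with/project_dir correspond to the tracking_dir plumbing added in finetune.py.
accelerator = accelerate.Accelerator(
    log_with="tensorboard", project_dir="/tmp/example-tracking"
)
accelerator.init_trackers("fine-tuning", config={"epochs": 1, "learning_rate": 1e-5})

for step in range(10):
    loss = 1.0 / (step + 1)  # placeholder training loss
    accelerator.log({"train loss": loss, "train perplexity": math.exp(loss)}, step=step)

# Flushes and closes every tracker; the diff calls this once after the epoch loop.
accelerator.end_training()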
4 changes: 2 additions & 2 deletions common/trainer/rm_trainer.py
@@ -52,7 +52,7 @@ def compute_loss(self, batch, return_outputs=False):

     def train(self):
         num_train_epochs = self.config.get("num_train_epochs", 1)
-        log_step = self.config.get("log_step", 1)
+        logging_steps = self.config.get("logging_steps", 1)
         if not os.path.exists(self.config.get("log_path", ".")):
             os.makedirs(self.config.get("log_path", "."), exist_ok=True)
         writer = SummaryWriter(self.config.get("log_path", "."))

@@ -69,7 +69,7 @@ def train(self):
                 if self.lr_scheduler is not None:
                     self.lr_scheduler.step()
                 self.optimizer.zero_grad()
-                if step % log_step == 0:
+                if step % logging_steps == 0:
                     logger.info(
                         f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{len(self.train_dataloader)}]\tloss:{loss}\tppl:{math.exp(loss)}\ttime:{time.time()-start}"
                     )
3 changes: 3 additions & 0 deletions docs/finetune_parameters.md
@@ -10,9 +10,11 @@ The following are the parameters supported in the finetuning workflow.
 |gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is related to GPT; otherwise it is False.|
 |output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
 |checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoints|
+|tracking_dir|/tmp/llm-ray/tracking|The directory for storing logs from locally compatible trackers (TensorBoard)|
 |config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
 |lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then used as the config to build the PEFT model object.|
 |deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then used as the config to build the [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
+|enable_gradient_checkpointing|False|Enable gradient checkpointing to save GPU memory, at the cost of extra compute at training time|


 ## Dataset Parameters

@@ -40,3 +42,4 @@ The following are the parameters supported in the finetuning workflow.
 |max_train_steps|None|Total number of training steps to perform. If provided, overrides epochs.|
 |gradient_accumulation_steps|1|Number of update steps to accumulate before performing a backward/update pass.|
 |seed|None|A seed for reproducible training.|
+|logging_steps|10|Log training metrics every this many steps|
53 changes: 44 additions & 9 deletions finetune/finetune.py
@@ -4,6 +4,7 @@
 import argparse
 from typing import Any, Dict, Union

+import torch
 import accelerate
 from accelerate.utils import is_xpu_available

@@ -62,6 +63,14 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
     return mode_env_vars[mode]


+def convert_dtype(dtype: str) -> torch.dtype:
+    supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
+    if dtype in supported_dtypes:
+        return supported_dtypes[dtype]
+    else:
+        raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")
+
+
 def train_func(config: Dict[str, Any]):
     cwd = config.get("cwd")
     if cwd:

@@ -79,9 +88,26 @@ def train_func(config: Dict[str, Any]):
         )
     else:
         fsdp_plugin = None
+
+    log_with = "tensorboard"  # only support tensorboard as tracker
+    output_dir = config["General"]["output_dir"]
+    tracking_dir = config["General"]["tracking_dir"]
     accelerator = accelerate.Accelerator(
-        gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin
+        gradient_accumulation_steps=gradient_accumulation_steps,
+        fsdp_plugin=fsdp_plugin,
+        log_with=log_with,
+        project_dir=tracking_dir,
     )
+    epochs = config["Training"]["epochs"]
+    tracker_config = {
+        "epochs": epochs,
+        "learning_rate": config["Training"]["learning_rate"],
+        "batch_size": config["Training"]["batch_size"],
+    }
+    base_model = config["General"]["base_model"]
+    dataset_file = config["Dataset"]["train_file"]
+    accelerator.init_trackers("fine-tuning", config=tracker_config)
+
     common.logger.info(
         f"accelerator generate finish, accelerator device type = {accelerator.device}"
     )

@@ -92,23 +118,25 @@

     datasets = common.dataset.Dataset.registory.get("HuggingfaceDataset")()(
         config={
-            "name": config["Dataset"]["train_file"],
+            "name": dataset_file,
             "validation_file": config["Dataset"]["validation_file"],
             "validation_split_percentage": config["Dataset"]["validation_split_percentage"],
         }
     )

     tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(
         config={
-            "name": config["General"]["base_model"],
+            "name": base_model,
             "config": config["General"]["config"],
         }
     )

     model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
         config={
-            "name": config["General"]["base_model"],
+            "name": base_model,
+            "dtype": convert_dtype(config["Training"]["mixed_precision"]),
"config": config["General"]["config"],
"enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
"lora_config": config["General"]["lora_config"]
if config["General"].get("lora_config")
else None,
Expand All @@ -125,10 +153,10 @@ def train_func(config: Dict[str, Any]):

trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(
config={
"num_train_epochs": config["Training"]["epochs"],
"num_train_epochs": epochs,
"max_train_step": config["Training"].get("max_train_steps", None),
"log_step": 1,
"output": config["General"]["output_dir"],
"logging_steps": config["Training"].get("logging_steps", 1),
"output": output_dir,
"dataprocesser": {
"type": "GeneralProcesser",
"per_device_train_batch_size": config["Training"]["batch_size"],
Expand Down Expand Up @@ -217,14 +245,21 @@ def main(external_config=None):
"FI_PROVIDER": "tcp",
}
}

accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode, config)
runtime_env["env_vars"].update(accelerate_env_vars)

if config["General"]["gpt_base_model"] is True:
runtime_env["pip"] = ["transformers==4.26.0"]

ray.init(runtime_env=runtime_env)
import intel_extension_for_pytorch as ipex

if "xpu" in ipex.__version__:
num_cpus = (
resources_per_worker["CPU"] * num_training_workers + 1
) # additional 1 for head worker
ray.init(num_cpus=num_cpus, runtime_env=runtime_env)
else:
ray.init(runtime_env=runtime_env)

common.logger.info(f"ray available resources = {ray.available_resources()}")

Expand Down
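A quick illustration of the new convert_dtype helper: the function body below is copied from the diff, and the calls are ours. Note that Training.mixed_precision defaults to "no" in finetune_config.py, a value this mapping does not cover and would reject.

import torch

def convert_dtype(dtype: str) -> torch.dtype:
    # Same mapping as the helper added in finetune/finetune.py.
    supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
    if dtype in supported_dtypes:
        return supported_dtypes[dtype]
    else:
        raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")

assert convert_dtype("bf16") is torch.bfloat16
assert convert_dtype("fp32") is torch.float32

try:
    convert_dtype("no")  # the default mixed_precision value in finetune_config.py
except ValueError as err:
    print(err)  # only supported torch.dtype list [...]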
4 changes: 4 additions & 0 deletions finetune/finetune.yaml
@@ -3,6 +3,7 @@ General:
   gpt_base_model: true
   output_dir: /tmp/llm-ray/output
   checkpoint_dir: /tmp/llm-ray/checkpoint
+  tracking_dir: /tmp/llm-ray/tracking
   config:
     trust_remote_code: false
     use_auth_token: null

@@ -11,6 +12,7 @@ General:
     r: 8
     lora_alpha: 32
     lora_dropout: 0.1
+  enable_gradient_checkpointing: false
 Dataset:
   train_file: examples/data/sample_finetune_data_small.jsonl
   validation_file: null

@@ -28,3 +30,5 @@ Training:
   resources_per_worker:
     CPU: 32
   accelerate_mode: CPU_DDP
+  gradient_accumulation_steps: 2
+  logging_steps: 10
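A hedged note on what gradient_accumulation_steps: 2 implies for training: the optimizer performs one update per accumulation window across all data-parallel workers, so the effective global batch size multiplies out as sketched below. Only the accumulation value comes from this file; the batch size and worker count are hypothetical.

# Effective global batch size under gradient accumulation with DDP workers.
per_device_batch_size = 4        # hypothetical; set via Training.batch_size
num_training_workers = 2         # hypothetical; set via Training.num_training_workers
gradient_accumulation_steps = 2  # matches the value added above

effective_batch_size = (
    per_device_batch_size * num_training_workers * gradient_accumulation_steps
)
print(effective_batch_size)  # 16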
9 changes: 9 additions & 0 deletions finetune/finetune_config.py
@@ -26,9 +26,11 @@ class General(BaseModel):
     gpt_base_model: bool
     output_dir: str
     checkpoint_dir: str
+    tracking_dir: str
     config: GeneralConfig
     lora_config: Optional[LoraConfig] = None
     deltatuner_config: Optional[DeltatunerConfig] = None
+    enable_gradient_checkpointing: bool = False


 class Dataset(BaseModel):

@@ -54,6 +56,8 @@ class Training(BaseModel):
     resources_per_worker: RayResourceConfig
     accelerate_mode: str
     mixed_precision: str = "no"
+    gradient_accumulation_steps: int
+    logging_steps: int = 10

     @validator("device")
     def check_device(cls, v: str):

@@ -69,6 +73,11 @@ def check_accelerate_mode(cls, v: str):
             raise ValueError(f"accelerate_mode must be one of {modes}")
         return v

+    @validator("logging_steps")
+    def check_logging_steps(cls, v: int):
+        assert v > 0
+        return v
+
     # @model_validator(mode='after')
     # def check_device_and_accelerate_mode(self) -> "Training":
     #     dev = self.device
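For context, here is a minimal reproduction of how a pydantic v1 @validator such as check_logging_steps rejects invalid values at parse time; assertion failures inside a validator surface as a ValidationError. The Training model below is a stripped-down stand-in, not the project's full class.

from pydantic import BaseModel, ValidationError, validator

class Training(BaseModel):
    logging_steps: int = 10  # only the field relevant to the validator

    @validator("logging_steps")
    def check_logging_steps(cls, v: int):
        assert v > 0
        return v

print(Training(logging_steps=5).logging_steps)  # 5

try:
    Training(logging_steps=0)  # fails the v > 0 assertion
except ValidationError as err:
    print(err)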
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
     "peft>=0.4.0",
     "deltatuner==1.1.9",
     "py-cpuinfo",
-    "pydantic-yaml",
+    "pydantic-yaml"
 ]

 [project.optional-dependencies]

@@ -48,11 +48,11 @@ cpu = [

 gpu = [
     "transformers>=4.35.0",
-    "torch==2.0.1a0",
-    "torchvision==0.15.2a0",
-    "intel-extension-for-pytorch==2.0.110+xpu",
-    "oneccl_bind_pt==2.0.100+gpu",
-    "dpctl==0.14.5"
+    "torch==2.1.0a0",
+    "torchvision==0.16.0a0",
+    "intel_extension_for_pytorch==2.1.10+xpu",
+    "oneccl_bind_pt==2.1.100+xpu",
+    "dpctl==0.15.0"
 ]

 deepspeed = [