13 changes: 12 additions & 1 deletion common/model/huggingface_model_for_causal_lm.py
@@ -8,13 +8,24 @@
class HuggingFaceModelForCausalLM(Model):
def __call__(self, config):
name = config.get("name")
model_dtype = config.get("dtype")
model_config = config.get("config", {})
model = transformers.AutoModelForCausalLM.from_pretrained(name, **model_config)
model = transformers.AutoModelForCausalLM.from_pretrained(
name, torch_dtype=model_dtype, **model_config
)

lora_config = config.get("lora_config", None)
if lora_config:
peft_config = LoraConfig(**lora_config)
model = get_peft_model(model, peft_config)
deltatuner_config = config.get("deltatuner_config", None)
if deltatuner_config:
model = deltatuner.optimize(model, **deltatuner_config)

enable_gradient_checkpointing = config.get("enable_gradient_checkpointing")
if enable_gradient_checkpointing:
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.config.use_cache = False

return model
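For orientation, here is a standalone sketch of what the updated loader does, written against the public transformers/peft APIs rather than this repo's registry; the model name, dtype, and LoRA values are illustrative placeholders, not values taken from this PR.

```python
# Illustrative sketch only: mirrors the loader above with hard-coded example values.
import torch
import transformers
from peft import LoraConfig, get_peft_model

name = "EleutherAI/gpt-j-6b"  # placeholder model name
model = transformers.AutoModelForCausalLM.from_pretrained(
    name,
    torch_dtype=torch.bfloat16,  # corresponds to the new "dtype" config entry
    trust_remote_code=False,     # forwarded via the "config" dict
)

# Optional LoRA wrapping, driven by the "lora_config" entry.
peft_config = LoraConfig(task_type="CAUSAL_LM", r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)

# New in this PR: optional gradient checkpointing trades extra compute for memory.
model.enable_input_require_grads()
model.gradient_checkpointing_enable()
model.config.use_cache = False  # KV cache is incompatible with checkpointing, so disable it
```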
23 changes: 18 additions & 5 deletions common/trainer/default_trainer.py
@@ -85,8 +85,6 @@ def _get_lr_scheduler(
num_steps_per_epoch,
accelerator,
):
# gradient_accumulation_steps = accelerator.gradient_accumulation_steps
# num_update_steps_per_epoch = math.ceil(num_steps_per_epoch / gradient_accumulation_steps)
enable = lr_scheduler_config.get("enable", False)
if not enable:
return None
@@ -153,7 +151,7 @@ def prepare(self, model, tokenizer, dataset, optimizer, accelerator):
def train(self):
num_train_epochs = self.config.get("num_train_epochs", 1)
checkpoint = self.config.get("checkpoint")
log_step = self.config.get("log_step", 1)
logging_steps = self.config.get("logging_steps", 1)
max_train_step = self.config.get("max_train_step")
max_eval_step = self.config.get("max_eval_step")
for idx in range(self.starting_epoch, num_train_epochs, 1):
@@ -170,12 +168,17 @@ def train(self):
if self.lr_scheduler is not None:
self.lr_scheduler.step()
self.optimizer.zero_grad()
if step % log_step == 0:

if step % logging_steps == 0:
loss = loss.item()
ppl = math.exp(loss)
logger.info(
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{math.exp(loss):.6f}\ttime:{time.time()-start:.6f}"
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
)

Reviewer (Contributor): Instead of outputting just 0, 1, 2, etc., can we support output like 0.1, 0.2, etc., as in other workflows?
Author: OK, will update in another PR.

report(
{
"loss": loss,
"ppl": ppl,
"train_epoch": idx,
"total_epochs": num_train_epochs,
"train_step": step,
@@ -184,6 +187,10 @@ def train(self):
else total_steps,
}
)
self.accelerator.log(
{"train loss": loss, "train perplexity": ppl},
step=idx * total_steps + step,
)

Reviewer (Contributor): Do we want to use Ray's report or accelerator.log to log the metrics? The code above currently logs the metrics twice, right? If Ray's report already meets our requirements, I think we don't need accelerator.log to log them again.

start = time.time()
if max_train_step is not None:
if step >= max_train_step - 1:
@@ -214,6 +221,9 @@ def train(self):
except OverflowError:
eval_loss = float("inf")
perplexity = float("inf")
self.accelerator.log(
{"evaluate loss": eval_loss, "evaluate perplexity": perplexity}
)
logger.info(
f"eval epoch:[{idx}/{num_train_epochs}]\tloss:[{eval_loss:.6f}]\tppl:[{perplexity:.6f}]\ttime:[{time.time()-start:.6f}]"
)
@@ -232,6 +242,9 @@ def train(self):
save_function=self.accelerator.save,
)
logger.info(f"finish save model to {output}")

self.accelerator.end_training()

self.accelerator.wait_for_everyone()

def _get_local_path(self, root_path, model_name):
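To make the logging discussion above concrete, here is a minimal sketch of the per-step pattern these changes introduce, assuming `report` is the Ray Train metrics hook already used by this trainer and `accelerator` is the prepared `accelerate.Accelerator`; the helper and its signature are illustrative, not code from the PR.

```python
import math
import time


def log_train_step(report, accelerator, logger, loss_tensor, idx, step,
                   total_steps, num_train_epochs, logging_steps, start):
    """Illustrative helper: emit train metrics every `logging_steps` steps."""
    if step % logging_steps != 0:
        return
    loss = loss_tensor.item()
    try:
        ppl = math.exp(loss)
    except OverflowError:  # same guard the evaluation path uses
        ppl = float("inf")
    logger.info(
        f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{total_steps}]"
        f"\tloss:{loss:.6f}\tppl:{ppl:.6f}\ttime:{time.time()-start:.6f}"
    )
    # Ray-side reporting drives Ray Train progress and metrics.
    report({"loss": loss, "ppl": ppl, "train_epoch": idx, "train_step": step})
    # Tracker-side logging writes the same values to TensorBoard via accelerate,
    # which is the duplication the review comment above asks about.
    accelerator.log(
        {"train loss": loss, "train perplexity": ppl},
        step=idx * total_steps + step,
    )
```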
4 changes: 2 additions & 2 deletions common/trainer/rm_trainer.py
@@ -52,7 +52,7 @@ def compute_loss(self, batch, return_outputs=False):

def train(self):
num_train_epochs = self.config.get("num_train_epochs", 1)
log_step = self.config.get("log_step", 1)
logging_steps = self.config.get("logging_steps", 1)
if not os.path.exists(self.config.get("log_path", ".")):
os.makedirs(self.config.get("log_path", "."), exist_ok=True)
writer = SummaryWriter(self.config.get("log_path", "."))
@@ -69,7 +69,7 @@ def train(self):
if self.lr_scheduler is not None:
self.lr_scheduler.step()
self.optimizer.zero_grad()
if step % log_step == 0:
if step % logging_steps == 0:
logger.info(
f"train epoch:[{idx}/{num_train_epochs}]\tstep:[{step}/{len(self.train_dataloader)}]\tloss:{loss}\tppl:{math.exp(loss)}\ttime:{time.time()-start}"
)
3 changes: 3 additions & 0 deletions docs/finetune_parameters.md
@@ -10,9 +10,11 @@ The following are the parameters supported in the finetuning workflow.
|gpt_base_model|True|This parameter is for [Transformers#22482](https://github.com/huggingface/transformers/issues/22482). It needs to be set to True when the pretrained model is related to GPT, otherwise it is False.|
|output_dir|/tmp/llm-ray/output|The output directory to store the finetuned model|
|checkpoint_dir|/tmp/llm-ray/checkpoint|The directory to store checkpoint|
|tracking_dir|/tmp/llm-ray/tracking|The path to a directory for storing logs of locally-compatible loggers|
|config|trust_remote_code: False<br> use_auth_token: None|Will be passed to the transformers `from_pretrained()` method|
|lora_config|task_type: CAUSAL_LM<br>r: 8<br>lora_alpha: 32<br>lora_dropout: 0.1|Will be passed to the LoraConfig `__init__()` method, then used as the config to build the Peft model object.|
|deltatuner_config|"algo": "lora"<br>"denas": True<br>"best_model_structure": "/path/to/best_structure_of_deltatuner_model"|Will be passed to the DeltaTunerArguments `__init__()` method, then used as the config to build the [Deltatuner model](https://github.com/intel/e2eAIOK/tree/main/e2eAIOK/deltatuner) object.|
|enable_gradient_checkpointing|False|Enable gradient checkpointing to save GPU memory, at the cost of extra compute during training|

Reviewer (Contributor), on the tracking_dir row: Can we directly use output_dir + "tracking" as the directory and not add this new parameter?


## Dataset Parameters
@@ -40,3 +42,4 @@ The following are the parameters supported in the finetuning workflow.
|max_train_steps|None|Total number of training steps to perform. If provided, overrides epochs.|
|gradient_accumulation_steps|1|Number of update steps to accumulate before performing a backward/update pass.|
|seed|None|A seed for reproducible training.|
|logging_steps|10|Log training metrics every this many steps|
53 changes: 44 additions & 9 deletions finetune/finetune.py
@@ -4,6 +4,7 @@
import argparse
from typing import Any, Dict, Union

import torch
import accelerate
from accelerate.utils import is_xpu_available

@@ -62,6 +63,14 @@ def get_accelerate_environment_variable(mode: str, config: Union[Dict[str, Any],
return mode_env_vars[mode]


def convert_dtype(dtype: str) -> torch.dtype:
supported_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
if dtype in supported_dtypes:
return supported_dtypes[dtype]
else:
raise ValueError(f"only supported torch.dtype list [{supported_dtypes.keys()}]")

Reviewer (Contributor): You passed mixed_precision as the parameter; its value could be "no", "fp16", "bf16", or "fp8", but "no" and "fp8" are not properly handled here.
Author: Updated.

Reviewer (Contributor): Can you add the check in finetune_config.py instead of here?
Author: Yes, will update in another PR.
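As a hypothetical follow-up to the review note above (not code from this PR), the helper could handle the remaining Accelerate mixed_precision values explicitly, for example by treating "no" as "use the checkpoint's default dtype" and rejecting "fp8" with a clear message; everything below is a sketch under those assumptions.

```python
from typing import Optional

import torch


def convert_dtype(dtype: str) -> Optional[torch.dtype]:
    # Sketch only: "no" maps to None so from_pretrained falls back to the
    # checkpoint's default dtype; anything else unsupported fails fast.
    supported_dtypes = {
        "fp16": torch.float16,
        "bf16": torch.bfloat16,
        "fp32": torch.float32,
        "no": None,
    }
    if dtype not in supported_dtypes:
        raise ValueError(
            f"unsupported mixed_precision value '{dtype}', "
            f"expected one of {list(supported_dtypes)}"
        )
    return supported_dtypes[dtype]
```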



def train_func(config: Dict[str, Any]):
cwd = config.get("cwd")
if cwd:
@@ -79,9 +88,26 @@ def train_func(config: Dict[str, Any]):
)
else:
fsdp_plugin = None

log_with = "tensorboard" # only support tensorboard as tracker
output_dir = config["General"]["output_dir"]
tracking_dir = config["General"]["tracking_dir"]
accelerator = accelerate.Accelerator(
gradient_accumulation_steps=gradient_accumulation_steps, fsdp_plugin=fsdp_plugin
gradient_accumulation_steps=gradient_accumulation_steps,
fsdp_plugin=fsdp_plugin,
log_with=log_with,
project_dir=tracking_dir,
)
epochs = config["Training"]["epochs"]
tracker_config = {
"epochs": epochs,
"learning_rate": config["Training"]["learning_rate"],
"batch_size": config["Training"]["batch_size"],
}
base_model = config["General"]["base_model"]
dataset_file = config["Dataset"]["train_file"]
accelerator.init_trackers("fine-tuning", config=tracker_config)

common.logger.info(
f"accelerator generate finish, accelerator device type = {accelerator.device}"
)
@@ -92,23 +118,25 @@

datasets = common.dataset.Dataset.registory.get("HuggingfaceDataset")()(
config={
"name": config["Dataset"]["train_file"],
"name": dataset_file,
"validation_file": config["Dataset"]["validation_file"],
"validation_split_percentage": config["Dataset"]["validation_split_percentage"],
}
)

tokenizer = common.tokenizer.Tokenizer.registory.get("HuggingFaceTokenizer")()(
config={
"name": config["General"]["base_model"],
"name": base_model,
"config": config["General"]["config"],
}
)

model = common.model.Model.registory.get("HuggingFaceModelForCausalLM")()(
config={
"name": config["General"]["base_model"],
"name": base_model,
"dtype": convert_dtype(config["Training"]["mixed_precision"]),
"config": config["General"]["config"],
"enable_gradient_checkpointing": config["General"]["enable_gradient_checkpointing"],
"lora_config": config["General"]["lora_config"]
if config["General"].get("lora_config")
else None,
@@ -125,10 +153,10 @@

trainer = common.trainer.Trainer.registory.get("DefaultTrainer")(
config={
"num_train_epochs": config["Training"]["epochs"],
"num_train_epochs": epochs,
"max_train_step": config["Training"].get("max_train_steps", None),
"log_step": 1,
"output": config["General"]["output_dir"],
"logging_steps": config["Training"].get("logging_steps", 1),
"output": output_dir,
"dataprocesser": {
"type": "GeneralProcesser",
"per_device_train_batch_size": config["Training"]["batch_size"],
@@ -217,14 +245,21 @@ def main(external_config=None):
"FI_PROVIDER": "tcp",
}
}

accelerate_env_vars = get_accelerate_environment_variable(accelerate_mode, config)
runtime_env["env_vars"].update(accelerate_env_vars)

if config["General"]["gpt_base_model"] is True:
runtime_env["pip"] = ["transformers==4.26.0"]

ray.init(runtime_env=runtime_env)
import intel_extension_for_pytorch as ipex

if "xpu" in ipex.__version__:
num_cpus = (
resources_per_worker["CPU"] * num_training_workers + 1
) # additional 1 for head worker
ray.init(num_cpus=num_cpus, runtime_env=runtime_env)
else:
ray.init(runtime_env=runtime_env)

Reviewer (Contributor): Why do we need this change, and can we avoid it? If we start Ray first and then execute the finetune command, do we still need this change, and does it still work?
Author: This change is a workaround: ray.init gets blocked when the llm-on-ray workflow runs on Intel GPU with torch/ipex 2.1+ unless CPU or GPU resources are passed explicitly.

common.logger.info(f"ray available resources = {ray.available_resources()}")

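For reference, a condensed sketch of the tracking lifecycle that the changes above wire together, assuming accelerate is installed with TensorBoard available; the directory path and hyperparameter values are placeholders rather than values from this PR.

```python
import accelerate

# Mirrors the setup in train_func(): a TensorBoard tracker rooted at tracking_dir.
accelerator = accelerate.Accelerator(
    gradient_accumulation_steps=2,
    log_with="tensorboard",
    project_dir="/tmp/llm-ray/tracking",  # stand-in for config["General"]["tracking_dir"]
)
accelerator.init_trackers(
    "fine-tuning",
    config={"epochs": 3, "learning_rate": 1e-5, "batch_size": 4},
)

# Inside the training loop, the trainer then logs per logging step:
accelerator.log({"train loss": 1.23, "train perplexity": 3.42}, step=10)

# And once training finishes, the trainer closes the trackers:
accelerator.end_training()
```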
4 changes: 4 additions & 0 deletions finetune/finetune.yaml
@@ -3,6 +3,7 @@ General:
gpt_base_model: true
output_dir: /tmp/llm-ray/output
checkpoint_dir: /tmp/llm-ray/checkpoint
tracking_dir: /tmp/llm-ray/tracking
config:
trust_remote_code: false
use_auth_token: null
@@ -11,6 +12,7 @@ General:
r: 8
lora_alpha: 32
lora_dropout: 0.1
enable_gradient_checkpointing: false
Dataset:
train_file: examples/data/sample_finetune_data_small.jsonl
validation_file: null
@@ -28,3 +30,5 @@ Training:
resources_per_worker:
CPU: 32
accelerate_mode: CPU_DDP
gradient_accumulation_steps: 2

Reviewer (Contributor): The default value is 1 in our document; can you please set it to 1 here?
Author: Yes, will update in another PR.

logging_steps: 10
9 changes: 9 additions & 0 deletions finetune/finetune_config.py
@@ -26,9 +26,11 @@ class General(BaseModel):
gpt_base_model: bool
output_dir: str
checkpoint_dir: str
tracking_dir: str
config: GeneralConfig
lora_config: Optional[LoraConfig] = None
deltatuner_config: Optional[DeltatunerConfig] = None
enable_gradient_checkpointing: bool = False


class Dataset(BaseModel):
@@ -54,6 +56,8 @@ class Training(BaseModel):
resources_per_worker: RayResourceConfig
accelerate_mode: str
mixed_precision: str = "no"
gradient_accumulation_steps: int
Reviewer (Contributor): Can you set the default value to 1 here?
Author: Yes, will update in another PR.

logging_steps: int = 10

@validator("device")
def check_device(cls, v: str):
@@ -69,6 +73,11 @@ def check_accelerate_mode(cls, v: str):
raise ValueError(f"accelerate_mode must be one of {modes}")
return v

@validator("logging_steps")
def check_logging_steps(cls, v: int):
assert v > 0
return v

# @model_validator(mode='after')
# def check_device_and_accelerate_mode(self) -> "Training":
# dev = self.device
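The new validator can be illustrated with a small standalone sketch in the same pydantic v1 style as the `@validator` usage above; `TrainingSketch` is an illustrative stand-in, not the real Training model.

```python
from pydantic import BaseModel, ValidationError, validator


class TrainingSketch(BaseModel):
    # Illustrative stand-in for the Training model above.
    logging_steps: int = 10

    @validator("logging_steps")
    def check_logging_steps(cls, v: int):
        assert v > 0, "logging_steps must be a positive integer"
        return v


try:
    TrainingSketch(logging_steps=0)
except ValidationError as err:
    print(err)  # reports the failed assertion for logging_steps
```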
12 changes: 6 additions & 6 deletions pyproject.toml
@@ -35,7 +35,7 @@ dependencies = [
"peft>=0.4.0",
"deltatuner==1.1.9",
"py-cpuinfo",
"pydantic-yaml",
"pydantic-yaml"
]

[project.optional-dependencies]
@@ -48,11 +48,11 @@

gpu = [
"transformers>=4.35.0",
"torch==2.0.1a0",
"torchvision==0.15.2a0",
"intel-extension-for-pytorch==2.0.110+xpu",
"oneccl_bind_pt==2.0.100+gpu",
"dpctl==0.14.5"
"torch==2.1.0a0",
"torchvision==0.16.0a0",
"intel_extension_for_pytorch==2.1.10+xpu",
"oneccl_bind_pt==2.1.100+xpu",
"dpctl==0.15.0"
]

deepspeed = [