
Commit f5c4568

quic-mamta authored and mamtsing committed
[QEff Finetune]: Implement logger for finetuning and enable dumping (#371)
1. Implement logger for finetuning.
2. Enable dumping logs via the given flag.

Signed-off-by: Mamta Singh <[email protected]>
Co-authored-by: Mamta Singh <[email protected]>
Signed-off-by: Amit Raj <[email protected]>
1 parent fd93d8c commit f5c4568

File tree: 13 files changed, +221 additions, -154 deletions

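The hunks below lean on logger.log_rank_zero, logger.raise_error, and logger.prepare_for_logs from QEfficient.finetune.utils.logging_utils; that module is one of the 13 changed files but does not appear in this excerpt. The following is a minimal sketch of such a wrapper, assuming it gates messages to rank 0 of a distributed run and optionally dumps them to a file under output_dir. Only the method names and call sites come from the diffs; everything else here is an assumption, not the actual implementation.

import logging
import os

import torch.distributed as dist


class FTLogger:
    """Hypothetical rank-aware logger wrapper; the real logging_utils module is not shown in this commit view."""

    def __init__(self, name: str = "QEfficient-Finetune"):
        self._logger = logging.getLogger(name)
        self._logger.setLevel(logging.INFO)
        self._logger.addHandler(logging.StreamHandler())

    def prepare_for_logs(self, output_dir: str, dump_logs: bool = False, level: int = logging.INFO) -> None:
        # Set verbosity and, when dump_logs is True, also write records to a file under output_dir.
        self._logger.setLevel(level)
        if dump_logs:
            os.makedirs(output_dir, exist_ok=True)
            self._logger.addHandler(logging.FileHandler(os.path.join(output_dir, "finetune.log")))

    def log_rank_zero(self, msg: str, level: int = logging.INFO) -> None:
        # Emit only from rank 0 so multi-device runs do not duplicate every message.
        if dist.is_available() and dist.is_initialized() and dist.get_rank() != 0:
            return
        self._logger.log(level, msg)

    def raise_error(self, msg: str, exception_class: type = RuntimeError) -> None:
        # Log the message once, then raise the requested exception type.
        self.log_rank_zero(msg, logging.ERROR)
        raise exception_class(msg)


logger = FTLogger()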

QEfficient/cloud/finetune.py

Lines changed: 29 additions & 19 deletions
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import logging
 import random
 import warnings
 from typing import Any, Dict, Optional, Union
@@ -17,7 +18,7 @@
 import torch.utils.data
 from peft import PeftModel, get_peft_model
 from torch.optim.lr_scheduler import StepLR
-from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 
 from QEfficient.finetune.configs.training import TrainConfig
 from QEfficient.finetune.utils.config_utils import (
@@ -26,18 +27,22 @@
     update_config,
 )
 from QEfficient.finetune.utils.dataset_utils import get_dataloader
+from QEfficient.finetune.utils.logging_utils import logger
 from QEfficient.finetune.utils.parser import get_finetune_parser
-from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
-from QEfficient.utils._utils import login_and_download_hf_lm
+from QEfficient.finetune.utils.train_utils import (
+    get_longest_seq_length,
+    print_model_size,
+    print_trainable_parameters,
+    train,
+)
+from QEfficient.utils._utils import hf_download
 
 # Try importing QAIC-specific module, proceed without it if unavailable
 try:
     import torch_qaic  # noqa: F401
 except ImportError as e:
-    print(f"Warning: {e}. Proceeding without QAIC modules.")
-
+    logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.", logging.WARNING)
 
-from transformers import AutoModelForSequenceClassification
 
 # Suppress all warnings
 warnings.filterwarnings("ignore")
@@ -106,7 +111,8 @@ def load_model_and_tokenizer(
     - Resizes model embeddings if tokenizer vocab size exceeds model embedding size.
     - Sets pad_token_id to eos_token_id if not defined in the tokenizer.
     """
-    pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
+    logger.log_rank_zero(f"Loading HuggingFace model for {train_config.model_name}")
+    pretrained_model_path = hf_download(train_config.model_name)
     if train_config.task_type == "seq_classification":
         model = AutoModelForSequenceClassification.from_pretrained(
             pretrained_model_path,
@@ -116,7 +122,7 @@ def load_model_and_tokenizer(
         )
 
         if not hasattr(model, "base_model_prefix"):
-            raise RuntimeError("Given huggingface model does not have 'base_model_prefix' attribute.")
+            logger.raise_error("Given huggingface model does not have 'base_model_prefix' attribute.", RuntimeError)
 
         for param in getattr(model, model.base_model_prefix).parameters():
             param.requires_grad = False
@@ -141,11 +147,10 @@ def load_model_and_tokenizer(
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
    # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing embedding matrix to match tokenizer vocab size.")
+        logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.", logging.WARNING)
         model.resize_token_embeddings(len(tokenizer))
 
-    # FIXME (Meet): Cover below line inside the logger once it is implemented.
-    print_model_size(model, train_config)
+    print_model_size(model)
 
     # Note: Need to call this before calling PeftModel.from_pretrained or get_peft_model.
     # Because, both makes model.is_gradient_checkpointing = True which is used in peft library to
@@ -157,7 +162,9 @@ def load_model_and_tokenizer(
         if hasattr(model, "supports_gradient_checkpointing") and model.supports_gradient_checkpointing:
             model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"preserve_rng_state": False})
         else:
-            raise RuntimeError("Given model doesn't support gradient checkpointing. Please disable it and run it.")
+            logger.raise_error(
+                "Given model doesn't support gradient checkpointing. Please disable it and run it.", RuntimeError
+            )
 
     model = apply_peft(model, train_config, peft_config_file, **kwargs)
 
@@ -192,7 +199,7 @@ def apply_peft(
     else:
         peft_config = generate_peft_config(train_config, peft_config_file, **kwargs)
         model = get_peft_model(model, peft_config)
-        model.print_trainable_parameters()
+        print_trainable_parameters(model)
 
     return model
 
@@ -217,25 +224,26 @@ def setup_dataloaders(
     - Length of longest sequence in the dataset.
 
     Raises:
-        ValueError: If validation is enabled but the validation set is too small.
+        RuntimeError: If validation is enabled but the validation set is too small.
 
     Notes:
     - Applies a custom data collator if provided by get_custom_data_collator.
    - Configures DataLoader kwargs using get_dataloader_kwargs for train and val splits.
     """
 
     train_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="train")
-    print(f"--> Num of Training Set Batches loaded = {len(train_dataloader)}")
+    logger.log_rank_zero(f"Number of Training Set Batches loaded = {len(train_dataloader)}")
 
     eval_dataloader = None
     if train_config.run_validation:
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="val")
         if len(eval_dataloader) == 0:
-            raise ValueError(
-                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            logger.raise_error(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
+                ValueError,
             )
         else:
-            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+            logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")
 
         longest_seq_length, _ = get_longest_seq_length(
             torch.utils.data.ConcatDataset([train_dataloader.dataset, eval_dataloader.dataset])
@@ -274,13 +282,15 @@ def main(peft_config_file: str = None, **kwargs) -> None:
     dataset_config = generate_dataset_config(train_config.dataset)
     update_config(dataset_config, **kwargs)
 
+    logger.prepare_for_logs(train_config.output_dir, train_config.dump_logs, train_config.log_level)
+
     setup_distributed_training(train_config)
     setup_seeds(train_config.seed)
     model, tokenizer = load_model_and_tokenizer(train_config, dataset_config, peft_config_file, **kwargs)
 
     # Create DataLoaders for the training and validation dataset
     train_dataloader, eval_dataloader, longest_seq_length = setup_dataloaders(train_config, dataset_config, tokenizer)
-    print(
+    logger.log_rank_zero(
         f"The longest sequence length in the train data is {longest_seq_length}, "
         f"passed context length is {train_config.context_length} and overall model's context length is "
        f"{model.config.max_position_embeddings}"

QEfficient/finetune/configs/training.py

Lines changed: 4 additions & 1 deletion
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import logging
 from dataclasses import dataclass
 
 
@@ -94,5 +95,7 @@ class TrainConfig:
     use_profiler: bool = False  # Enable pytorch profiler, can not be used with flop counter at the same time.
     # profiler_dir: str = "PATH/to/save/profiler/results" # will be used if using profiler
 
-    dump_root_dir: str = "mismatches/step_"
     opByOpVerifier: bool = False
+
+    dump_logs: bool = True
+    log_level: str = logging.INFO
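Since TrainConfig is a dataclass, the two new fields can also be set directly when building a config in code. A brief sketch; the constructor-style usage is illustrative, as the finetune entry point normally populates these through its parser and update_config.

import logging

from QEfficient.finetune.configs.training import TrainConfig

# dump_logs defaults to True; log_level defaults to logging.INFO (the numeric constant,
# even though the field is annotated as str).
cfg = TrainConfig(dump_logs=False, log_level=logging.WARNING)
print(cfg.dump_logs, cfg.log_level)  # False 30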

QEfficient/finetune/dataset/alpaca_dataset.py

Lines changed: 9 additions & 1 deletion
@@ -11,6 +11,8 @@
 import torch
 from torch.utils.data import Dataset
 
+from QEfficient.finetune.utils.logging_utils import logger
+
 PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with an input that provides further context. "
@@ -27,7 +29,13 @@
 
 class InstructionDataset(Dataset):
     def __init__(self, dataset_config, tokenizer, partition="train", context_length=None):
-        self.ann = json.load(open(dataset_config.data_path))
+        try:
+            self.ann = json.load(open(dataset_config.data_path))
+        except FileNotFoundError:
+            logger.raise_error(
+                "Loading of alpaca dataset failed! Please use (wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/) to download the alpaca dataset.",
+                FileNotFoundError,
+            )
         # Use 5% of the dataset for evaluation
         eval_length = int(len(self.ann) / 20)
         if partition == "train":

QEfficient/finetune/dataset/custom_dataset.py

Lines changed: 18 additions & 10 deletions
@@ -8,6 +8,8 @@
 import importlib
 from pathlib import Path
 
+from QEfficient.finetune.utils.logging_utils import logger
+
 
 def load_module_from_py_file(py_file: str) -> object:
     """
@@ -30,20 +32,22 @@ def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
     module_path, func_name = dataset_config.file, "get_custom_dataset"
 
     if not module_path.endswith(".py"):
-        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_error(
+            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+        )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_config, tokenizer, split, context_length)
-    except AttributeError as e:
-        print(
-            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()})."
+    except AttributeError:
+        logger.raise_error(
+            f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).",
+            AttributeError,
         )
-        raise e
 
 
 def get_data_collator(dataset_processer, dataset_config):
@@ -53,16 +57,20 @@ def get_data_collator(dataset_processer, dataset_config):
     module_path, func_name = dataset_config.file, "get_data_collator"
 
     if not module_path.endswith(".py"):
-        raise ValueError(f"Dataset file {module_path} is not a .py file.")
+        logger.raise_error(f"Dataset file {module_path} is not a .py file.", ValueError)
 
     module_path = Path(module_path)
     if not module_path.is_file():
-        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")
+        logger.raise_error(
+            f"Dataset py file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
+        )
 
     module = load_module_from_py_file(module_path.as_posix())
     try:
         return getattr(module, func_name)(dataset_processer)
     except AttributeError:
-        print(f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()}).")
-        print("Using the default data_collator instead.")
+        logger.log_rank_zero(
+            f"Can not find the custom data_collator in the dataset.py file ({module_path.as_posix()})."
+        )
+        logger.log_rank_zero("Using the default data_collator instead.")
         return None
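For reference, the loader above expects the user-supplied dataset .py file to define get_custom_dataset (and optionally get_data_collator) with exactly the call signatures used in the getattr(...) lines. A minimal sketch of such a file follows; the file name and the toy data are placeholders, not part of this commit.

# my_custom_dataset.py -- hypothetical file passed as dataset_config.file
from torch.utils.data import Dataset


class _ToyDataset(Dataset):
    def __init__(self, tokenizer, split, context_length):
        self.samples = ["hello world", "quic efficient finetuning"]  # placeholder data
        self.tokenizer = tokenizer
        self.context_length = context_length

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.tokenizer(self.samples[idx], max_length=self.context_length, truncation=True)


def get_custom_dataset(dataset_config, tokenizer, split, context_length=None):
    # Called as getattr(module, "get_custom_dataset")(dataset_config, tokenizer, split, context_length)
    return _ToyDataset(tokenizer, split, context_length)


def get_data_collator(dataset_processer):
    # Called as getattr(module, "get_data_collator")(dataset_processer). Omitting this
    # function makes the loader above log a message and fall back to the default collator.
    return None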

QEfficient/finetune/dataset/grammar_dataset.py

Lines changed: 8 additions & 9 deletions
@@ -10,6 +10,8 @@
 from datasets import load_dataset
 from torch.utils.data import Dataset
 
+from QEfficient.finetune.utils.logging_utils import logger
+
 
 class grammar(Dataset):
     def __init__(self, tokenizer, csv_name=None, context_length=None):
@@ -19,11 +21,11 @@ def __init__(self, tokenizer, csv_name=None, context_length=None):
                 data_files={"train": [csv_name]},  # "eval": "grammar_validation.csv"},
                 delimiter=",",
             )
-        except Exception as e:
-            print(
-                "Loading of grammar dataset failed! Please see [here](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset."
+        except FileNotFoundError:
+            logger.raise_error(
+                "Loading of grammar dataset failed! Please check (https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb) for details on how to download the dataset.",
+                FileNotFoundError,
            )
-            raise e
 
         self.context_length = context_length
         self.tokenizer = tokenizer
@@ -36,7 +38,7 @@ def convert_to_features(self, example_batch):
         # Create prompt and tokenize contexts and questions
 
         if self.print_text:
-            print("Input Text: ", self.clean_text(example_batch["text"]))
+            logger.log_rank_zero("Input Text: ", self.clean_text(example_batch["text"]))
 
         input_ = example_batch["input"]
         target_ = example_batch["target"]
@@ -71,9 +73,6 @@ def get_dataset(dataset_config, tokenizer, csv_name=None, context_length=None):
     """cover function for handling loading the working dataset"""
     """dataset loading"""
     currPath = Path.cwd() / "datasets_grammar" / "grammar_train.csv"
-    print(f"Loading dataset {currPath}")
-    csv_name = str(currPath)
-    print(csv_name)
-    dataset = grammar(tokenizer=tokenizer, csv_name=csv_name, context_length=context_length)
+    dataset = grammar(tokenizer=tokenizer, csv_name=str(currPath), context_length=context_length)
 
     return dataset

QEfficient/finetune/eval.py

Lines changed: 8 additions & 12 deletions
@@ -19,13 +19,14 @@
 from utils.train_utils import evaluation, print_model_size
 
 from QEfficient.finetune.configs.training import TrainConfig
+from QEfficient.finetune.utils.logging_utils import logger
 
 try:
     import torch_qaic  # noqa: F401
 
     device = "qaic:0"
 except ImportError as e:
-    print(f"Warning: {e}. Moving ahead without these qaic modules.")
+    logger.log_rank_zero(f"{e}. Moving ahead without these qaic modules.")
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # Suppress all warnings
@@ -77,25 +78,20 @@ def main(**kwargs):
     # If there is a mismatch between tokenizer vocab size and embedding matrix,
     # throw a warning and then expand the embedding matrix
     if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
-        print("WARNING: Resizing the embedding matrix to match the tokenizer vocab size.")
+        logger.log_rank_zero("Resizing the embedding matrix to match the tokenizer vocab size.")
         model.resize_token_embeddings(len(tokenizer))
 
-    print_model_size(model, train_config)
+    print_model_size(model)
 
     if train_config.run_validation:
-        # TODO: vbaddi enable packing later in entire infra.
-        # if train_config.batching_strategy == "packing":
-        #     dataset_val = ConcatDataset(dataset_val, chunk_size=train_config.context_length)
-
         eval_dataloader = get_dataloader(tokenizer, dataset_config, train_config, split="test")
-
-        print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
         if len(eval_dataloader) == 0:
-            raise ValueError(
-                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})"
+            logger.raise_error(
+                f"The eval set size is too small for dataloader to load even one batch. Please increase the size of eval set. ({len(eval_dataloader)=})",
+                ValueError,
             )
         else:
-            print(f"--> Num of Validation Set Batches loaded = {len(eval_dataloader)}")
+            logger.log_rank_zero(f"Number of Validation Set Batches loaded = {len(eval_dataloader)}")
 
     model.to(device)
     _ = evaluation(model, train_config, eval_dataloader, None, tokenizer, device)
