@@ -26,6 +26,8 @@ class OverrideDefinitions:
2626
2727 override_args : Sequence [Sequence [str ]] = tuple (tuple (" " ))
2828 test_descr : str = "default"
29+ requires_seed_checkpoint : bool = False
30+ ngpu : int = 4
2931
3032
3133CONFIG_DIR = "./train_configs"
@@ -85,25 +87,104 @@ class OverrideDefinitions:
8587 ],
8688 "Checkpoint Integration Test - Save Model Weights Only bf16" ,
8789 ),
90+ OverrideDefinitions (
91+ [
92+ [
93+ "--checkpoint.enable_checkpoint" ,
94+ f"--checkpoint.folder { test_checkpoint_dir } _pp" ,
95+ "--experimental.pipeline_parallel_degree 2" ,
96+ "--experimental.pipeline_parallel_split_points layers.1" ,
97+ "--training.data_parallel_degree 1" ,
98+ "--model.norm_type rmsnorm" , # TODO fix fused_rmsnorm issue
99+ ],
100+ ],
101+ "PP 1D test" ,
102+ requires_seed_checkpoint = True ,
103+ ngpu = 2 ,
104+ ),
105+ OverrideDefinitions (
106+ [
107+ [
108+ "--checkpoint.enable_checkpoint" ,
109+ f"--checkpoint.folder { test_checkpoint_dir } _pp_dp" ,
110+ "--experimental.pipeline_parallel_degree 2" ,
111+ "--experimental.pipeline_parallel_split_points layers.1" ,
112+ "--training.data_parallel_degree 2" ,
113+ "--model.norm_type fused_rmsnorm" ,
114+ ],
115+ ],
116+ "PP+DP 2D test" ,
117+ requires_seed_checkpoint = True ,
118+ ),
119+ OverrideDefinitions (
120+ [
121+ [
122+ "--checkpoint.enable_checkpoint" ,
123+ f"--checkpoint.folder { test_checkpoint_dir } _pp_tp" ,
124+ "--experimental.pipeline_parallel_degree 2" ,
125+ "--experimental.pipeline_parallel_split_points layers.1" ,
126+ "--training.tensor_parallel_degree 2" ,
127+ "--model.norm_type rmsnorm" , # TODO fix fused_rmsnorm issue
128+ ],
129+ ],
130+ "PP+TP 2D test" ,
131+ requires_seed_checkpoint = True ,
132+ ),
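133+ # Disabled: PP(2) x DP(2) x TP(2) needs 8 GPUs, more than the default ngpu=4 (presumably more than the test machine has; TODO confirm and re-enable).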
134+ # OverrideDefinitions(
135+ # [
136+ # [
137+ # "--checkpoint.enable_checkpoint",
138+ # f"--checkpoint.folder {test_checkpoint_dir}_pp_dp_tp",
139+ # "--experimental.pipeline_parallel_degree 2",
140+ # "--experimental.pipeline_parallel_split_points layers.1",
141+ # "--training.data_parallel_degree 2",
142+ # "--training.tensor_parallel_degree 2",
143+ # "--model.norm_type rmsnorm", # TODO fix fused_rmsnorm issue
144+ # ],
145+ # ],
146+ # "PP+DP+TP 3D test",
147+ # requires_seed_checkpoint=True,
148+ # ),
88149]
89150
90151
152+ def _run_cmd (cmd ):
153+ return subprocess .run (
154+ [cmd ],
155+ stdout = subprocess .PIPE ,
156+ stderr = subprocess .STDOUT ,
157+ text = True ,
158+ shell = True ,
159+ )
160+
161+
91162def run_test (test_flavor : OverrideDefinitions , full_path : str ):
92163 # run_test supports sequence of tests.
93164 for override_arg in test_flavor .override_args :
94- cmd = f"CONFIG_FILE={ full_path } NGPU=4 LOG_RANK=0,1,2,3 ./run_llama_train.sh"
165+
166+ cmd = f"CONFIG_FILE={ full_path } NGPU={ test_flavor .ngpu } LOG_RANK=0,1,2,3 ./run_llama_train.sh"
95167 if override_arg :
96168 cmd += " " + " " .join (override_arg )
97169 print (
98170 f"=====Integration test, flavor : { test_flavor .test_descr } , command : { cmd } ====="
99171 )
100- result = subprocess .run (
101- [cmd ],
102- stdout = subprocess .PIPE ,
103- stderr = subprocess .STDOUT ,
104- text = True ,
105- shell = True ,
106- )
172+
173+ if test_flavor .requires_seed_checkpoint :
174+ checkpoint_folder_arg = None
175+ for arg in override_arg :
176+ if "--checkpoint.folder" in arg :
177+ checkpoint_folder_arg = arg
178+ assert (
179+ checkpoint_folder_arg is not None
180+ ), "Can't use seed checkpoint if folder is not specified"
181+ print ("Creating seed checkpoint" )
182+ result = _run_cmd (
183+ f"CONFIG_FILE={ full_path } ./create_seed_checkpoint.sh { checkpoint_folder_arg } "
184+ )
185+ print (result .stdout )
186+
187+ result = _run_cmd (cmd )
107188 print (result .stdout )
108189 if result .returncode != 0 :
109190 raise Exception (
0 commit comments