@@ -119,6 +119,8 @@ static std::vector<llama_device_memory_data> llama_get_device_memory_data(
     hp_n_ctx_train = model->hparams.n_ctx_train;
     hp_n_expert = model->hparams.n_expert;
 
+    llama_memory_breakdown_print(ctx); // goes to debug log
+
     llama_free(ctx);
     llama_model_free(model);
     llama_log_set(ud.original_logger.callback, ud.original_logger.user_data);
@@ -142,6 +144,7 @@ bool llama_params_fit(
 
     // step 1: get data for default parameters and check whether any changes are necessary in the first place
 
+    LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__);
     const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     const size_t nd = devs.size(); // number of devices
     if (nd == 0) {
@@ -316,6 +319,7 @@ bool llama_params_fit(
     tensor_buft_overides[1] = {nullptr, nullptr};
     mparams->tensor_buft_overrides = tensor_buft_overides;
 
+    LLAMA_LOG_DEBUG("%s: getting device memory data for all MoE tensors in system memory:\n", __func__);
     const dmds_t dmds_cpu_moe = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     int64_t global_surplus = 0;
     for (const llama_device_memory_data & dmd : dmds_cpu_moe) {
@@ -328,34 +332,59 @@ bool llama_params_fit(
     // step 3: for MoE models, if at least the dense tensors can be fit, try fitting as many full layers as possible
 
     const uint32_t nl_scaling = hp_ngl / nd;
-    const std::vector<memory_scaling> spl_part = get_memory_scaling( // size per device and per partial == MoE-only layer
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> spl_part; // size per device and per partial == MoE-only layer
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all MoE tensors in system memory:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all MoE tensors in system memory:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        spl_part = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
+    for (size_t id = 0; id < nd; id++) {
+        LLAMA_LOG_DEBUG("%s: spl_part[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n",
+            __func__, id, spl_part[id].base/MiB, spl_part[id].per_layer/MiB);
+    }
 
     // for spl_part all MoE tensors were still on CPU, reset the TBOs so that all tensors are on the devices again
     tensor_buft_overides[0] = {nullptr, nullptr};
     mparams->tensor_buft_overrides = tensor_buft_overides;
 
-    const std::vector<memory_scaling> spl_full = get_memory_scaling( // size per device and per full layer
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> spl_full; // size per device and per full layer
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 layer + all tensors in device memory:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " layers + all tensors in device memory:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        spl_full = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
+    for (size_t id = 0; id < nd; id++) {
+        LLAMA_LOG_DEBUG("%s: spl_full[%zu]: base=%" PRId64 " MiB, per_layer=%" PRId64 " MiB\n",
+            __func__, id, spl_full[id].base/MiB, spl_full[id].per_layer/MiB);
+    }
 
     // the non-repeating tensors (e.g. output matrix) are difficult to quantify,
     // get memory use with all tensors on the last device and use that as the starting point for the last device only
     for (size_t id = 0; id < nd - 1; id++) {
         tensor_split[id] = 0.0f;
     }
     tensor_split[nd - 1] = 1.0f;
+    LLAMA_LOG_DEBUG("%s: getting device memory data with entire model on last device:\n", __func__);
     const dmds_t dmds_last = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level);
     tensor_split[nd - 1] = 0.0f;
 
     struct ngl {
         uint32_t part = 0;
         uint32_t full = 0;
+
+        explicit operator std::string() const {
+            return "[" + std::to_string(part) + ", " + std::to_string(full) + "]";
+        }
     };
 
     // utility function that distributes layers to devices and returns whether the memory margin can be met on all devices
     // - ngl_per_device: resulting distribution of dense-only/full layers across devices
     // - global_ngl_part: total number of dense-only layers
-    auto distribute_layers = [&](std::vector<ngl> & ngl_per_device, const uint32_t global_ngl_part) -> bool {
+    auto distribute_layers = [&](std::vector<ngl> & ngl_per_device, std::vector<int64_t> & usable_memory, const uint32_t global_ngl_part) -> bool {
         // reset result to initial state, initially put entire model on the last device
         for (size_t id = 0; id < nd - 1; id++) {
             ngl_per_device[id] = {0, 0};
@@ -364,16 +393,14 @@ bool llama_params_fit(
         ngl_per_device.back().full = hp_ngl + 1;
 
         // usable_memory: free memory above margin that can be used for further allocations
-        std::vector<int64_t> usable_memory;
-        usable_memory.reserve(nd);
         for (size_t id = 0; id < nd - 1; id++) {
             int64_t um = dmds_last[id].free - margin - spl_full[id].base;
             um = std::max(um, int64_t(0));
-            usable_memory.push_back(um);
+            usable_memory[id] = um;
         }
         {
             const llama_memory_breakdown_data & mb = dmds_last.back().mb;
-            usable_memory.push_back(dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin);
+            usable_memory.back() = dmds_last.back().free - int64_t(mb.model + mb.context + mb.context) - margin;
         }
 
         // convert some layers on the last device from full layers to dense-only layers
@@ -425,9 +452,21 @@ bool llama_params_fit(
 
     // iteratively increase the number of partial layers until the memory consumption is low enough
     std::vector<ngl> ngl_per_device(nd);
-    for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) {
-        if (distribute_layers(ngl_per_device, global_ngl_part)) {
-            break;
+    {
+        std::vector<int64_t> usable_memory(nd);
+        for (uint32_t global_ngl_part = 0; global_ngl_part < hp_ngl; global_ngl_part++) {
+            const bool success = distribute_layers(ngl_per_device, usable_memory, global_ngl_part);
+            std::string ngl_per_device_str = std::string(ngl_per_device[0]);
+            std::string usable_memory_str = std::to_string(usable_memory[0]/MiB);
+            for (size_t id = 1; id < nd; id++) {
+                ngl_per_device_str += ", " + std::string(ngl_per_device[id]);
+                usable_memory_str += ", " + std::to_string(usable_memory[id]/MiB);
+            }
+            LLAMA_LOG_DEBUG("%s: global_ngl_part=%" PRIu32 ", success=%d, ngl_per_device=[%s], usable_memory[MiB]=[%s]\n",
+                __func__, global_ngl_part, success ? 1 : 0, ngl_per_device_str.c_str(), usable_memory_str.c_str());
+            if (success) {
+                break;
+            }
         }
     }
 
@@ -504,8 +543,14 @@ bool llama_params_fit(
     // all layers are the same so simply determine how many layers will fit per device
 
     const uint32_t nl_scaling = hp_ngl / nd;
-    const std::vector<memory_scaling> ms = get_memory_scaling(
-        get_memory_for_const_layer(1), get_memory_for_const_layer(nl_scaling), nl_scaling);
+    std::vector<memory_scaling> ms;
+    {
+        LLAMA_LOG_DEBUG("%s: getting device memory data for 1 full layer:\n", __func__);
+        auto tmp1 = get_memory_for_const_layer(1);
+        LLAMA_LOG_DEBUG("%s: getting device memory data for %" PRIu32 " full layers:\n", __func__, nl_scaling);
+        auto tmpn = get_memory_for_const_layer(nl_scaling);
+        ms = get_memory_scaling(tmp1, tmpn, nl_scaling);
+    }
 
     mparams->n_gpu_layers = 0;
     std::vector<uint32_t> ngl_per_device;
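
A note on the `base` / `per_layer` values that the new debug lines print: the implementation of `get_memory_scaling()` is not part of this diff, but from the way it is called (one probe with 1 layer, one with `nl_scaling` layers) it presumably fits a per-device linear model `memory(n_layers) ≈ base + per_layer * n_layers`. Below is a minimal sketch of such a fit under that assumption; `memory_scaling_sketch` and `fit_memory_scaling` are hypothetical names, not the actual llama.cpp API.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the struct whose base/per_layer fields are logged above.
struct memory_scaling_sketch {
    int64_t base      = 0; // memory that does not depend on the number of layers
    int64_t per_layer = 0; // additional memory per repeating layer
};

// Fit base/per_layer per device from two probes:
// mem1[id] = memory used on device id with 1 layer, memn[id] = memory used with n layers (n > 1).
static std::vector<memory_scaling_sketch> fit_memory_scaling(
        const std::vector<int64_t> & mem1, const std::vector<int64_t> & memn, uint32_t n) {
    std::vector<memory_scaling_sketch> result(mem1.size());
    for (size_t id = 0; id < mem1.size(); id++) {
        result[id].per_layer = (memn[id] - mem1[id]) / int64_t(n - 1); // slope between the two probes
        result[id].base      = mem1[id] - result[id].per_layer;        // extrapolate back to 0 layers
    }
    return result;
}
```

Probing with `nl_scaling` layers rather than just 2 presumably makes the slope estimate less sensitive to per-probe noise and rounding; the logged `spl_part`/`spl_full` values can then be cross-checked against the breakdown emitted by `llama_memory_breakdown_print()` earlier in the debug log.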