diff --git a/common/arg.cpp b/common/arg.cpp
index a570810281499..da5510e946fd5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -20,6 +20,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -444,7 +445,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         params.kv_overrides.back().key[0] = 0;
     }
-    if (!params.tensor_buft_overrides.empty()) {
+    // pad tensor_buft_overrides for llama_params_fit:
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
         params.tensor_buft_overrides.push_back({nullptr, nullptr});
     }
@@ -1999,6 +2002,34 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_env("LLAMA_ARG_MAIN_GPU"));
+    add_opt(common_arg(
+        { "-fit", "--fit" }, "[on|off]",
+        string_format("whether to adjust unset arguments to fit in device memory ('on' or 'off', default: '%s')", params.fit_params ? "on" : "off"),
+        [](common_params & params, const std::string & value) {
+            if (is_truthy(value)) {
+                params.fit_params = true;
+            } else if (is_falsey(value)) {
+                params.fit_params = false;
+            } else {
+                throw std::runtime_error(
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
+            }
+        }
+    ).set_env("LLAMA_ARG_FIT"));
+    add_opt(common_arg(
+        { "-fitm", "--fit-margin" }, "MiB",
+        string_format("target margin per device for --fit option, default: %zu", params.fit_params_margin/(1024*1024)),
+        [](common_params & params, int value) {
+            params.fit_params_margin = value * size_t(1024*1024);
+        }
+    ).set_env("LLAMA_ARG_FIT_MARGIN"));
+    add_opt(common_arg(
+        { "-fitc", "--fit-ctx" }, "N",
+        string_format("minimum ctx size that can be set by --fit option, default: %" PRIu32, params.fit_params_min_ctx),
+        [](common_params & params, int value) {
+            params.fit_params_min_ctx = value;
+        }
+    ).set_env("LLAMA_ARG_FIT_CTX"));
     add_opt(common_arg(
         {"--check-tensors"},
         string_format("check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false"),
diff --git a/common/common.cpp b/common/common.cpp
index a8d709ab1d050..307848efb3f97 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -949,6 +949,13 @@ std::vector fs_list_files(const std::string & path) {
 struct common_init_result common_init_from_params(common_params & params) {
     common_init_result iparams;
     auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+
+    if (params.fit_params) {
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+            params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx,
+            params.verbosity > 0 ?
GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR); + } llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { @@ -959,8 +966,6 @@ struct common_init_result common_init_from_params(common_params & params) { const llama_vocab * vocab = llama_model_get_vocab(model); - auto cparams = common_context_params_to_llama(params); - llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { LOG_ERR("%s: failed to create context with model '%s', try reducing --n-gpu-layers if you're running out of VRAM\n", diff --git a/common/common.h b/common/common.h index 8540725aaa476..8df80f7cc6f68 100644 --- a/common/common.h +++ b/common/common.h @@ -89,6 +89,7 @@ enum llama_example { LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_DIFFUSION, LLAMA_EXAMPLE_FINETUNE, + LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_COUNT, }; @@ -274,8 +275,8 @@ struct lr_opt { struct ggml_opt_optimizer_params common_opt_lr_pars(void * userdata); struct common_params { - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 4096; // context size + int32_t n_predict = -1; // max. number of new tokens to predict, -1 == no limit + int32_t n_ctx = 0; // context size, 0 == context the model was trained with int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt @@ -296,9 +297,12 @@ struct common_params { // offload params std::vector devices; // devices to use for offloading - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + bool fit_params = true; // whether to fit unset model/context parameters to free device memory + size_t fit_params_margin = 1024 * 1024*1024; // margin per device in bytes for fitting parameters to free memory + int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h index 2cb150fd2a313..78aa059dde380 100644 --- a/ggml/include/ggml-alloc.h +++ b/ggml/include/ggml-alloc.h @@ -53,7 +53,14 @@ GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc); // call with a worst-case graph to avoid buffer reallocations // not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed // returns false if the buffer allocation failed +// ggml_gallocr_resrve_n_size writes the buffer sizes per galloc buffer that would be allocated by ggml_gallocr_reserve_n to sizes GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph); +GGML_API void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, + struct ggml_cgraph * graph, + const int * node_buffer_ids, + const int * leaf_buffer_ids, + size_t * sizes); GGML_API bool ggml_gallocr_reserve_n( ggml_gallocr_t galloc, struct ggml_cgraph * graph, @@ -68,6 +75,8 @@ GGML_API size_t 
ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_i // Utils // Create a buffer and allocate all the tensors in a ggml_context +// ggml_backend_alloc_ctx_tensors_from_buft_size returns the size of the buffer that would be allocated by ggml_backend_alloc_ctx_tensors_from_buft +GGML_API size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft); GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend); diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index f1b740785914e..4ed5f35774ffc 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -307,6 +307,7 @@ extern "C" { GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph + GGML_API void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes); GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index c1ed1a21c81c4..3b7dc1bbfde82 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2511,7 +2511,8 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); + GGML_API void ggml_log_get(ggml_log_callback * log_callback, void ** user_data); + GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 91aff205f1832..45f014d846ba0 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -602,7 +602,9 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { } static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { - return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; + return t->data != NULL // tensor data already set externally + || t->buffer // tensor on external buffer (but not yet allocated) + || ggml_gallocr_is_own(galloc, t); // tensor will be allocated by galloc } // free the extra space at the end if the new tensor is smaller @@ -820,7 +822,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr } } -bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { +static bool ggml_gallocr_reserve_n_impl( + ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, bool no_alloc) { size_t min_hash_size = graph->n_nodes + graph->n_leafs; // add 25% margin to avoid hash collisions min_hash_size += min_hash_size / 4; @@ -922,14 +925,19 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c if (realloc) { #ifndef NDEBUG size_t cur_size = galloc->buffers[i] ? 
ggml_vbuffer_size(galloc->buffers[i]) : 0; - GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", + __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); #endif ggml_vbuffer_free(galloc->buffers[i]); - galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); - if (galloc->buffers[i] == NULL) { - GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); - return false; + if (no_alloc) { + galloc->buffers[i] = NULL; + } else { + galloc->buffers[i] = ggml_vbuffer_alloc(galloc->bufts[i], galloc->buf_tallocs[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE); + if (galloc->buffers[i] == NULL) { + GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } } } } @@ -937,6 +945,21 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c return true; } +void ggml_gallocr_reserve_n_size( + ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids, size_t * sizes) { + GGML_ASSERT(ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ true)); + for (int i = 0; i < galloc->n_buffers; i++) { + sizes[i] = 0; + for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) { + sizes[i] += galloc->buf_tallocs[i]->chunks[c]->max_size; + } + } +} + +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { + return ggml_gallocr_reserve_n_impl(galloc, graph, node_buffer_ids, leaf_buffer_ids, /*no_alloc =*/ false); +} + bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL); } @@ -1139,7 +1162,8 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return true; } -ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { +static ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft_impl( + struct ggml_context * ctx, ggml_backend_buffer_type_t buft, size_t * nbytes_total, bool no_alloc) { GGML_ASSERT(ggml_get_no_alloc(ctx) == true); size_t alignment = ggml_backend_buft_get_alignment(buft); @@ -1147,6 +1171,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte ggml_backend_buffer_t * buffers = NULL; size_t n_buffers = 0; + *nbytes_total = 0; size_t cur_buf_size = 0; struct ggml_tensor * first = ggml_get_first_tensor(ctx); @@ -1158,10 +1183,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (cur_buf_size > 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer - if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { + if (!no_alloc && !alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; } first = t; + *nbytes_total += cur_buf_size; cur_buf_size = this_size; } else { cur_buf_size += this_size; @@ -1170,15 +1196,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte // allocate remaining tensors 
if (cur_buf_size > 0) { - if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { + *nbytes_total += cur_buf_size; + if (!no_alloc && !alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; } } + if (no_alloc) { + return NULL; + } + if (n_buffers == 0) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__); #endif + GGML_ASSERT(!buffers); return NULL; } @@ -1188,10 +1220,24 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } else { buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers); } - free(buffers); + if (buffers) { + free(buffers); // can be NULL if context is empty or no_alloc + } return buffer; } +size_t ggml_backend_alloc_ctx_tensors_from_buft_size(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc=*/ true); + GGML_ASSERT(!buf); + return nbytes_total; +} + +ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) { + size_t nbytes_total = 0; + return ggml_backend_alloc_ctx_tensors_from_buft_impl(ctx, buft, &nbytes_total, /*no_alloc =*/ false); +} + ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) { return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend)); } diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index ff9135fe2d878..a4507da93363c 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -36,12 +36,11 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { } ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + GGML_ASSERT(buft); if (size == 0) { // return a dummy buffer for zero-sized allocations return ggml_backend_buffer_init(buft, {}, NULL, 0); } - - GGML_ASSERT(buft); return buft->iface.alloc_buffer(buft, size); } @@ -1694,6 +1693,20 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched->is_alloc = false; } +void ggml_backend_sched_reserve_size(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph, size_t * sizes) { + GGML_ASSERT(sched); + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); + GGML_ASSERT(sizes); + + ggml_backend_sched_reset(sched); + + ggml_backend_sched_synchronize(sched); + + ggml_backend_sched_split_graph(sched, measure_graph); + + ggml_gallocr_reserve_n_size(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids, sizes); +} + bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { GGML_ASSERT(sched); GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 9be35c1be8456..bb53290fdacea 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7345,6 +7345,11 @@ size_t ggml_quantize_chunk( //////////////////////////////////////////////////////////////////////////////// +void ggml_log_get(ggml_log_callback * log_callback, void ** user_data) { + *log_callback = g_logger_state.log_callback; + *user_data = g_logger_state.log_callback_user_data; +} + void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? 
log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; diff --git a/include/llama.h b/include/llama.h index 8547226ff210c..0af3c94c1cdcd 100644 --- a/include/llama.h +++ b/include/llama.h @@ -298,6 +298,7 @@ extern "C" { bool check_tensors; // validate model tensor data bool use_extra_bufts; // use extra buffer types (used for weight repacking) bool no_host; // bypass host buffer allowing extra buffers to be used + bool no_alloc; // only load metadata and simulate memory allocations }; // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations @@ -451,10 +452,23 @@ extern "C" { // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); + // fits mparams and cparams to free device memory (assumes system memory is unlimited) + // returns true if the parameters could be successfully modified to fit device memory + LLAMA_API bool llama_params_fit( + const char * path_model, + struct llama_model_params * mparams, + struct llama_context_params * cparams, + float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements + struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements + size_t margin, // margin of memory to leave per device in bytes + uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use + enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log + LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); LLAMA_API size_t llama_max_parallel_sequences(void); + LLAMA_API size_t llama_max_tensor_buft_overrides(void); LLAMA_API bool llama_supports_mmap (void); LLAMA_API bool llama_supports_mlock (void); @@ -1336,7 +1350,8 @@ extern "C" { // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. - LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); + LLAMA_API void llama_log_get(ggml_log_callback * log_callback, void ** user_data); + LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); // // Performance utils diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 70a3ec62dfc63..cfdf289efb67c 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -227,6 +227,7 @@ llama_context::llama_context( backend_buft.clear(); backend_ptrs.clear(); + backend_buf_exp_size.clear(); for (auto & backend : backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); @@ -243,6 +244,7 @@ llama_context::llama_context( backend_buft.push_back(buft); backend_ptrs.push_back(backend.get()); + backend_buf_exp_size.push_back(0); } LLAMA_LOG_DEBUG("%s: backend_ptrs.size() = %zu\n", __func__, backend_ptrs.size()); @@ -358,7 +360,8 @@ llama_context::llama_context( // reserve pp (prompt processing) graph first so that buffers are only allocated once { - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), + model.hparams.no_alloc, model.hparams.no_alloc ? 
backend_buf_exp_size.data() : nullptr); if (!gf) { if (pipeline_parallel) { LLAMA_LOG_WARN("%s: compute buffer allocation failed, retrying without pipeline parallelism\n", __func__); @@ -376,7 +379,7 @@ llama_context::llama_context( // reserve with tg (token generation) graph to get the number of splits and nodes { - auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get()); + auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute tg buffers"); } @@ -391,7 +394,7 @@ llama_context::llama_context( // // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get()); // - auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get()); + auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get(), model.hparams.no_alloc); if (!gf) { throw std::runtime_error("failed to allocate compute pp buffers"); } @@ -400,11 +403,13 @@ llama_context::llama_context( for (size_t i = 0; i < backend_ptrs.size(); ++i) { ggml_backend_t backend = backend_ptrs[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; - size_t size = ggml_backend_sched_get_buffer_size(sched.get(), backend); - if (size > 1) { + if (!model.hparams.no_alloc) { + backend_buf_exp_size[i] = ggml_backend_sched_get_buffer_size(sched.get(), backend); + } + if (backend_buf_exp_size[i] > 1) { LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + backend_buf_exp_size[i] / 1024.0 / 1024.0); } } @@ -423,6 +428,22 @@ llama_context::llama_context( } llama_context::~llama_context() { + if (!model.hparams.no_alloc) { + for (size_t i = 0; i < backend_ptrs.size(); ++i) { + ggml_backend_t backend = backend_ptrs[i]; + ggml_backend_buffer_type_t buft = backend_buft[i]; + + const size_t size_exp = backend_buf_exp_size[i]; + const size_t size_act = ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (size_exp == size_act) { + LLAMA_LOG_DEBUG("%s: %10s compute buffer size is %8.4f MiB, matches expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } else { + LLAMA_LOG_WARN("%s: %10s compute buffer size of %8.4f MiB, does not match expectation of %8.4f MiB\n", + __func__, ggml_backend_buft_name(buft), size_act / (1024.0*1024.0), size_exp / (1024.0*1024.0)); + } + } + } ggml_opt_free(opt_ctx); } @@ -1393,7 +1414,8 @@ llm_graph_result * llama_context::get_gf_res_reserve() const { return static_cast(gf_res_reserve.get()); } -ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only) { +ggml_cgraph * llama_context::graph_reserve( + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only, size_t * sizes) { LLAMA_LOG_DEBUG("%s: reserving a graph for ubatch with n_tokens = %4u, n_seqs = %2u, n_outputs = %4u\n", __func__, n_tokens, n_seqs, n_outputs); GGML_ASSERT(n_outputs >= 1); @@ -1430,8 +1452,13 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u // initialize scheduler with the specified graph if (split_only) { - ggml_backend_sched_split_graph(sched.get(), gf); + if (sizes) { + ggml_backend_sched_reserve_size(sched.get(), gf, sizes); + } else { + ggml_backend_sched_split_graph(sched.get(), gf); + } } else if (!ggml_backend_sched_reserve(sched.get(), gf)) { + GGML_ASSERT(!sizes); LLAMA_LOG_ERROR("%s: failed to allocate compute 
buffers\n", __func__); return nullptr; } @@ -2053,15 +2080,26 @@ void llama_context::perf_reset() { std::map llama_context::memory_breakdown() const { std::map ret; - for (const auto & buft_size : model.memory_breakdown()) { - ret[buft_size.first].model += buft_size.second; + for (const auto & [buft, size] : model.memory_breakdown()) { + ret[buft].model += size; } - for (const auto & buft_size : memory->memory_breakdown()) { - ret[buft_size.first].context += buft_size.second; + if (memory) { + for (const auto & [buft, size] : memory->memory_breakdown()) { + ret[buft].context += size; + } } - for (const auto & backend_ptr : backends) { - ggml_backend_t backend = backend_ptr.get(); - ret[ggml_backend_sched_get_buffer_type(sched.get(), backend)].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + if (model.hparams.no_alloc) { + for (size_t i = 0; i < backends.size(); ++i) { + ggml_backend_t backend = backends[i].get(); + ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend); + ret[buft].compute += backend_buf_exp_size[i]; + } + } else { + for (const auto & backend_ptr : backends) { + ggml_backend_t backend = backend_ptr.get(); + ggml_backend_buffer_type_t buft = ggml_backend_sched_get_buffer_type(sched.get(), backend); + ret[buft].compute += ggml_backend_sched_get_buffer_size(sched.get(), backend); + } } return ret; } diff --git a/src/llama-context.h b/src/llama-context.h index 20cbd78955412..f639867dc2882 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -26,6 +26,10 @@ struct llama_memory_breakdown_data { size_t model = 0; // memory allocated for the model size_t context = 0; // memory allocated for the context size_t compute = 0; // memory allocated for temporary compute buffers + + size_t total() const { + return model + context + compute; + } }; struct llama_context { @@ -206,7 +210,8 @@ struct llama_context { ggml_status graph_compute(ggml_cgraph * gf, bool batched); // reserve a graph with a dummy ubatch of the specified size - ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false); + ggml_cgraph * graph_reserve( + uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx, bool split_only = false, size_t * sizes = nullptr); private: llm_graph_params graph_params( @@ -281,9 +286,10 @@ struct llama_context { std::vector> set_n_threads_fns; - // buffer types used for the compute buffer of each backend + // pointers and buffer types used for the compute buffer of each backend std::vector backend_ptrs; std::vector backend_buft; + std::vector backend_buf_exp_size; // expected buffer sizes llm_graph_result_ptr gf_res_prev; llm_graph_result_ptr gf_res_reserve; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 9203af83b2e32..5caca95b38be5 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -34,6 +34,7 @@ struct llama_hparams_convnext { struct llama_hparams { bool vocab_only; + bool no_alloc; bool rope_finetuned; bool use_par_res; bool swin_norm; diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp index 6ec709dd323a6..87b9d8bb27a55 100644 --- a/src/llama-impl.cpp +++ b/src/llama-impl.cpp @@ -25,6 +25,10 @@ time_meas::~time_meas() { } } +void llama_log_get(ggml_log_callback * log_callback, void ** user_data) { + ggml_log_get(log_callback, user_data); +} + void llama_log_set(ggml_log_callback log_callback, void * user_data) { ggml_log_set(log_callback, user_data); 
g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index e26385a1feaf1..eb0e6a260432b 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -175,7 +175,15 @@ llama_kv_cache::llama_kv_cache( // allocate tensors and initialize the buffers to avoid NaNs in the padding for (auto & [buft, ctx] : ctx_map) { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); + ggml_backend_buffer_t buf; + if (model.hparams.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer + for (ggml_tensor * t = ggml_get_first_tensor(ctx.get()); t != nullptr; t = ggml_get_next_tensor(ctx.get(), t)) { + t->buffer = buf; // set dummy buffer for KV cache so that the backend scheduler won't try to allocate it + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx.get(), buft); // real buffer + } if (!buf) { throw std::runtime_error("failed to allocate buffer for kv cache"); } @@ -482,9 +490,18 @@ llama_pos llama_kv_cache::seq_pos_max(llama_seq_id seq_id) const { std::map llama_kv_cache::memory_breakdown() const { std::map ret; - for (const auto & [_, buf] : ctxs_bufs) { - ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + for (const auto & [ctx, buf] : ctxs_bufs) { + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf.get()); + + if (hparams.no_alloc) { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) == nullptr); + ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); + } else { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); + ret[buft] += ggml_backend_buffer_get_size(buf.get()); + } } + return ret; } diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index aa3a65f87a542..ca2ea2461d223 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -473,6 +473,7 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, + bool no_alloc, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; @@ -716,6 +717,7 @@ llama_model_loader::llama_model_loader( this->use_mmap = use_mmap; this->check_tensors = check_tensors; + this->no_alloc = no_alloc; } std::string llama_model_loader::get_arch_name() const { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index c9189f6cb4466..0380c92fde0e3 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -71,6 +71,7 @@ struct llama_model_loader { bool use_mmap = false; bool check_tensors; + bool no_alloc; llama_files files; llama_ftype ftype; @@ -97,6 +98,7 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, + bool no_alloc, const llama_model_kv_override * param_overrides_p, const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 829f1e3c14f82..40ed7ce9f7320 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6353,9 +6353,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { std::vector bufs; if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) { + GGML_ASSERT(!ml.no_alloc); for (uint32_t idx = 0; idx < ml.files.size(); idx++) { // only the mmap region containing the tensors in the 
model is mapped to the backend buffer - // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, + // then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size void * addr = nullptr; size_t first, last; // NOLINT @@ -6371,9 +6373,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { bufs.emplace_back(buf); buf_map.emplace(idx, buf); } - } - else { - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + } else { + ggml_backend_buffer_t buf; + if (ml.no_alloc) { + buf = ggml_backend_buft_alloc_buffer(buft, /*size =*/ 0); // dummy buffer + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + t->buffer = buf; // set dummy buffer for weights so that the backend scheduler won't try to allocate them + } + } else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); // real buffer + } if (buf == nullptr) { throw std::runtime_error(format("unable to allocate %s buffer", ggml_backend_buft_name(buft))); } @@ -6428,6 +6437,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } + if (ml.no_alloc) { + return true; + } + // load tensor data for (auto & [ctx, buf_map] : ctx_buf_maps) { if (!ml.load_all_data(ctx, buf_map, use_mlock ? &pimpl->mlock_mmaps : NULL, params.progress_callback, params.progress_callback_user_data)) { @@ -6470,9 +6483,18 @@ size_t llama_model::n_devices() const { std::map llama_model::memory_breakdown() const { std::map ret; - for (const auto & [_, bufs] : pimpl->ctxs_bufs) { - for (const auto & buf : bufs) { - ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) { + if (hparams.no_alloc) { + GGML_ASSERT(bufs.size() == 1); + ggml_backend_buffer_t buf = bufs[0].get(); + GGML_ASSERT(ggml_backend_buffer_get_base(buf) == nullptr); + ggml_backend_buffer_type_t buft = ggml_backend_buffer_get_type(buf); + ret[buft] += ggml_backend_alloc_ctx_tensors_from_buft_size(ctx.get(), buft); + } else { + for (const auto & buf : bufs) { + GGML_ASSERT(ggml_backend_buffer_get_base(buf.get()) != nullptr); + ret[ggml_backend_buffer_get_type(buf.get())] += ggml_backend_buffer_get_size(buf.get()); + } } } return ret; @@ -6517,6 +6539,7 @@ void llama_model::print_info() const { // hparams LLAMA_LOG_INFO("%s: arch = %s\n", __func__, arch_name().c_str()); LLAMA_LOG_INFO("%s: vocab_only = %d\n", __func__, hparams.vocab_only); + LLAMA_LOG_INFO("%s: no_alloc = %d\n", __func__, hparams.no_alloc); if (!hparams.vocab_only) { LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); @@ -7342,6 +7365,7 @@ llama_model_params llama_model_default_params() { /*.check_tensors =*/ false, /*.use_extra_bufts =*/ true, /*.no_host =*/ false, + /*.no_alloc =*/ false, }; return result; diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index a56b2626ae1c5..6e5269804d93f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -596,7 +596,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ 
true, /*no_alloc*/ false, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index ab2e9868af468..b9590c8c7db65 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -1,6 +1,9 @@ +#include "llama.h" + #include "llama-impl.h" #include "llama-chat.h" +#include "llama-context.h" #include "llama-mmap.h" #include "llama-vocab.h" #include "llama-model-loader.h" @@ -11,11 +14,14 @@ #include "ggml-backend.h" #include +#include +#include #include #include #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -37,6 +43,591 @@ const char * llama_flash_attn_type_name(enum llama_flash_attn_type flash_attn_ty GGML_ABORT("fatal error"); } +struct llama_device_memory_data { + int64_t total; + int64_t free; + llama_memory_breakdown_data mb; +}; + +static std::vector llama_get_device_memory_data( + const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, + std::vector & devs, uint32_t & hp_ngl, uint32_t & hp_n_ctx_train, uint32_t & hp_n_expert, + uint32_t & hp_n_layers_dense_lead, const ggml_log_level log_level) { + struct user_data_t { + struct { + ggml_log_callback callback; + void * user_data; + } original_logger; + ggml_log_level min_level; // prints below this log level go to debug log + }; + user_data_t ud; + llama_log_get(&ud.original_logger.callback, &ud.original_logger.user_data); + ud.min_level = log_level; + + llama_log_set([](ggml_log_level level, const char * text, void * user_data) { + const user_data_t * ud = (const user_data_t *) user_data; + const ggml_log_level level_eff = level >= ud->min_level ? level : GGML_LOG_LEVEL_DEBUG; + ud->original_logger.callback(level_eff, text, ud->original_logger.user_data); + }, &ud); + + llama_model_params mparams_copy = *mparams; + mparams_copy.no_alloc = true; + mparams_copy.use_mmap = false; + + llama_model * model = llama_model_load_from_file(path_model, mparams_copy); + if (model == nullptr) { + throw std::runtime_error("failed to load model"); + } + + llama_context * ctx = llama_init_from_model(model, *cparams); + if (ctx == nullptr) { + llama_model_free(model); + throw std::runtime_error("failed to create llama_context from model"); + } + + std::vector ret(model->devices.size()); + + std::map memory_breakdown = ctx->memory_breakdown(); + + for (const auto & buft_mb : memory_breakdown) { + ggml_backend_buffer_type_t buft = buft_mb.first; + const llama_memory_breakdown_data & mb = buft_mb.second; + + if (ggml_backend_buft_is_host(buft)) { + continue; + } + + ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); + if (!dev) { + continue; + } + for (size_t i = 0; i < ret.size(); i++) { + if (model->devices[i] == dev) { + ret[i].mb.model += mb.model; + ret[i].mb.context += mb.context; + ret[i].mb.compute += mb.compute; + break; + } + } + } + for (size_t i = 0; i < ret.size(); i++) { + size_t free, total; + ggml_backend_dev_memory(model->devices[i], &free, &total); + ret[i].free = free; + ret[i].total = total; + } + + devs = model->devices; + hp_ngl = model->hparams.n_layer; + hp_n_ctx_train = model->hparams.n_ctx_train; + hp_n_expert = model->hparams.n_expert; + hp_n_layers_dense_lead = model->hparams.n_layer_dense_lead; + + llama_memory_breakdown_print(ctx); // goes to debug log + + llama_free(ctx); + llama_model_free(model); + llama_log_set(ud.original_logger.callback, ud.original_logger.user_data); + return ret; +} + + +static void 
llama_params_fit_impl( + const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, + float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, + size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { + constexpr int64_t MiB = 1024*1024; + const int64_t margin = margin_s; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits + typedef std::vector dmds_t; + const llama_model_params default_mparams = llama_model_default_params(); + + std::vector devs; + uint32_t hp_ngl = 0; // hparams.n_gpu_layers + uint32_t hp_nct = 0; // hparams.n_ctx_train + uint32_t hp_nex = 0; // hparams.n_expert + uint32_t hp_nldl = 0; // hparams.n_layers_dense_lead + + // step 1: get data for default parameters and check whether any changes are necessary in the first place + + LLAMA_LOG_DEBUG("%s: getting device memory data for initial parameters:\n", __func__); + const dmds_t dmds_full = llama_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, hp_nldl, log_level); + const size_t nd = devs.size(); // number of devices + if (nd == 0) { + LLAMA_LOG_INFO("%s: no devices with dedicated memory found\n", __func__); + return; + } + + std::vector dev_names; + { + dev_names.reserve(nd); + size_t max_length = 0; + for (ggml_backend_dev_t dev : devs) { + std::string name = ggml_backend_dev_name(dev); + name += " ("; + name += ggml_backend_dev_description(dev); + name += ")"; + dev_names.push_back(name); + max_length = std::max(max_length, name.length()); + } + for (std::string & dn : dev_names) { + dn.insert(dn.end(), max_length - dn.length(), ' '); + } + } + + int64_t sum_total = 0; + int64_t sum_projected_free = 0; + int64_t min_projected_free = INT64_MAX; + int64_t sum_projected_used = 0; + int64_t sum_projected_ctx = 0; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: projected memory use with initial parameters [MiB]:\n", __func__); + } + for (size_t id = 0; id < nd; id++) { + const llama_device_memory_data & dmd = dmds_full[id]; + + const int64_t projected_used = dmd.mb.total(); + const int64_t projected_free = dmd.free - projected_used; + + sum_total += dmd.total; + sum_projected_used += projected_used; + sum_projected_free += projected_free; + min_projected_free = std::min(min_projected_free, projected_free); + sum_projected_ctx += dmd.mb.context; + + if (nd > 1) { + LLAMA_LOG_INFO("%s: - %s: %6" PRId64 " total, %6" PRId64 " used, %6" PRId64 " %s\n", + __func__, dev_names[id].c_str(), dmd.total/MiB, projected_used/MiB, std::abs(projected_free)/MiB, + projected_free >= 0 ? "surplus" : "deficit"); + } + } + assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0); + assert(sum_projected_used >= sum_projected_ctx); + LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. 
%" PRId64 " MiB of free device memory\n", + __func__, sum_projected_used/MiB, sum_total/MiB); + if (min_projected_free >= margin) { + if (nd == 1) { + LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n", + __func__, min_projected_free/MiB, margin/MiB); + return; + } + LLAMA_LOG_INFO("%s: will leave at least %" PRId64 " >= %" PRId64 " MiB of free memory on all devices, no changes needed\n", + __func__, min_projected_free/MiB, margin/MiB); + return; + } + + // step 2: try reducing memory use by reducing the context size + + { + int64_t global_surplus = sum_projected_free - int64_t(nd)*margin; + if (global_surplus < 0) { + LLAMA_LOG_INFO(nd == 1 ? + "%s: cannot fulfill margin of %" PRId64 " MiB, need to reduce device memory by %" PRId64 " MiB\n" : + "%s: cannot fulfill margin of %" PRId64 " MiB on all devices, need to use %" PRId64 " MiB less in total\n", + __func__, margin/MiB, -global_surplus/MiB); + if (cparams->n_ctx == 0) { + if (hp_nct > n_ctx_min) { + const int64_t bytes_per_ctx = sum_projected_ctx / hp_nct; + const uint32_t ctx_reduction = std::min( + uint32_t((-global_surplus + bytes_per_ctx - 1) / bytes_per_ctx), hp_nct - n_ctx_min); + cparams->n_ctx = hp_nct - ctx_reduction; + const int64_t memory_reduction = ctx_reduction * bytes_per_ctx; + global_surplus += memory_reduction; + LLAMA_LOG_INFO("%s: context size reduced from %" PRIu32 " to %" PRIu32 " -> need %" PRId64 " MiB less memory in total\n", + __func__, hp_nct, cparams->n_ctx, memory_reduction/MiB); + } else { + LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n", + __func__, hp_nct, n_ctx_min); + } + } else { + LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx); + } + } + if (global_surplus > 0) { + LLAMA_LOG_INFO("%s: entire model can be fit across devices by reducing context\n", __func__); + return; + } + } + + if (mparams->n_gpu_layers != default_mparams.n_gpu_layers) { + throw std::runtime_error("n_gpu_layers already set by user to " + std::to_string(mparams->n_gpu_layers) + ", abort"); + } + if (nd > 1) { + if (!tensor_split) { + throw std::runtime_error("did not provide a buffer to write the tensor_split to, abort"); + } + if (mparams->tensor_split) { + for (size_t id = 0; id < nd; id++) { + if (mparams->tensor_split[id] != 0.0f) { + throw std::runtime_error("model_params::tensor_split already set by user, abort"); + } + } + } + if (mparams->split_mode == LLAMA_SPLIT_MODE_ROW) { + throw std::runtime_error("changing weight allocation for LLAMA_SPLIT_MODE_ROW not implemented, abort"); + } + if (hp_ngl < 2*nd) { + throw std::runtime_error("model has only " + std::to_string(hp_ngl) + " layers but need at least " + + std::to_string(2*nd) + " to fit memory for " + std::to_string(nd) + " devices, abort"); + } + } + if (!tensor_buft_overrides) { + throw std::runtime_error("did not provide buffer to set tensor_buft_overrides, abort"); + } + if (mparams->tensor_buft_overrides && (mparams->tensor_buft_overrides->pattern || mparams->tensor_buft_overrides->buft)) { + throw std::runtime_error("model_params::tensor_buft_overrides already set by user, abort"); + } + + // step 3: iteratively fill the devices front to back with model layers + // - for a dense model simply fill full layers, giving each device a contiguous slice of the model + // - for a MoE model, first fill the back with dense-only layers, then convert those to layers in the front + // - to 
reduce wasted VRAM, the memory from each device can "overflow" to the next device (or RAM for the last one) + + enum layer_fraction_t { + LAYER_FRACTION_NONE = 0, // nothing + LAYER_FRACTION_ATTN = 1, // attention + LAYER_FRACTION_UP = 2, // attention + up + LAYER_FRACTION_GATE = 3, // attention + up + gate + LAYER_FRACTION_MOE = 4, // everything but sparse MoE weights + }; + + // utility function that returns a static C string matching the tensors for a specific layer index and layer fraction: + auto get_overflow_pattern = [&](const size_t il, const layer_fraction_t lf) -> const char * { + switch (lf) { + case LAYER_FRACTION_ATTN: { + static std::vector patterns; + while (patterns.size() <= il) { + patterns.push_back("blk\\." + std::to_string(patterns.size()) + "\\.ffn_(up|gate|down).*"); + } + return patterns[il].c_str(); + } + case LAYER_FRACTION_UP: { + static std::vector patterns; + while (patterns.size() <= il) { + patterns.push_back("blk\\." + std::to_string(patterns.size()) + "\\.ffn_(gate|down).*"); + } + return patterns[il].c_str(); + } + case LAYER_FRACTION_GATE: { + static std::vector patterns; + while (patterns.size() <= il) { + patterns.push_back("blk\\." + std::to_string(patterns.size()) + "\\.ffn_down.*"); + } + return patterns[il].c_str(); + } + case LAYER_FRACTION_MOE: { + static std::vector patterns; + while (patterns.size() <= il) { + patterns.push_back("blk\\." + std::to_string(patterns.size()) + "\\.ffn_(up|down|gate)_(ch|)exps"); + } + return patterns[il].c_str(); + } + default: + GGML_ABORT("fatal error"); + return nullptr; + } + }; + + struct ngl_t { + uint32_t il_full_start = 0; // layer index for device at which the full layers start + uint32_t il_part_start = 0; // layer index at which the full layers stop and the partial layers start + uint32_t il_stop = 0; // layer index at which the layers stop + + // the first partial layer can have varying parts overflow, all further layers are LAYER_FRACTION_MOE: + layer_fraction_t overflow_type = LAYER_FRACTION_NONE; + }; + + const size_t ntbo = llama_max_tensor_buft_overrides(); + + std::vector overflow_bufts; + overflow_bufts.reserve(nd); + for (size_t id = 1; id < nd; ++id) { + overflow_bufts.push_back(ggml_backend_dev_buffer_type(devs[id])); + } + overflow_bufts.push_back(ggml_backend_cpu_buffer_type()); + + // utility function to set n_gpu_layers and tensor_split + auto set_ngl_tensor_split_tbo = [&](const std::vector & ngl_per_device, llama_model_params & mparams) { + mparams.n_gpu_layers = 0; + for (size_t id = 0; id < nd; id++) { + assert(ngl_per_device[id].il_stop >= ngl_per_device[id].il_full_start); + const uint32_t n_layer = ngl_per_device[id].il_stop - ngl_per_device[id].il_full_start; + mparams.n_gpu_layers += n_layer; + if (nd > 1) { + tensor_split[id] = ngl_per_device[id].il_stop - ngl_per_device[id].il_full_start; + } + } + if (hp_nex > 0 && nd > 1) { + tensor_split[nd - 1] += 1; // non-repeating tensors + } + mparams.tensor_split = tensor_split; + + size_t itbo = 0; + for (size_t id = 0; id < nd; id++) { + // on last device one of the "full layers" are the non-repeating layers + for (uint32_t il = ngl_per_device[id].il_part_start; il < ngl_per_device[id].il_stop; il++) { + if (itbo + 1 >= ntbo) { + tensor_buft_overrides[itbo].pattern = nullptr; + tensor_buft_overrides[itbo].buft = nullptr; + itbo++; + mparams.tensor_buft_overrides = tensor_buft_overrides; + throw std::runtime_error("llama_params_fit_n_tensor_buft_overrides() == " + + std::to_string(ntbo) + " is insufficient for model\n"); + } + 
tensor_buft_overrides[itbo].pattern = get_overflow_pattern(il, + il == ngl_per_device[id].il_part_start ? ngl_per_device[id].overflow_type : LAYER_FRACTION_MOE); + tensor_buft_overrides[itbo].buft = overflow_bufts[id]; + itbo++; + } + } + tensor_buft_overrides[itbo].pattern = nullptr; + tensor_buft_overrides[itbo].buft = nullptr; + itbo++; + mparams.tensor_buft_overrides = tensor_buft_overrides; + }; + + // utility function that returns the memory use per device for given numbers of layers per device + auto get_memory_for_layers = [&](const char * func_name, const std::vector & ngl_per_device) -> std::vector { + assert(ngl_per_device[0].il_full_start == 0); + assert(ngl_per_device.back().il_stop <= hp_ngl); + llama_model_params mparams_copy = *mparams; + set_ngl_tensor_split_tbo(ngl_per_device, mparams_copy); + + const dmds_t dmd_nl = llama_get_device_memory_data( + path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, hp_nldl, log_level); + + LLAMA_LOG_DEBUG("%s: memory for test allocation by device:\n", func_name); + for (size_t id = 0; id < nd; id++) { + const ngl_t & n = ngl_per_device[id]; + LLAMA_LOG_DEBUG( + "%s: id=%zu, il_full_start=%2" PRIu32 ", il_part_start=%2" PRIu32 ", " + "il_stop=%2" PRIu32 ", overflow_type=%d, mem=%6" PRId64 " MiB\n", + func_name, id, n.il_full_start, n.il_part_start, n.il_stop, int(n.overflow_type), dmd_nl[id].mb.total()/MiB); + } + + std::vector ret; + ret.reserve(nd); + for (const llama_device_memory_data & dmd : dmd_nl) { + ret.push_back(dmd.mb.total()); + } + return ret; + }; + + int64_t global_surplus_cpu_moe = 0; + if (hp_nex > 0) { + const static std::string pattern_moe_all = "blk\\.\\d+\\.ffn_(up|down|gate)_(ch|)exps"; // matches all MoE tensors + ggml_backend_buffer_type_t cpu_buft = ggml_backend_cpu_buffer_type(); + tensor_buft_overrides[0] = {pattern_moe_all.c_str(), cpu_buft}; + tensor_buft_overrides[1] = {nullptr, nullptr}; + mparams->tensor_buft_overrides = tensor_buft_overrides; + + LLAMA_LOG_DEBUG("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); + const dmds_t dmds_cpu_moe = llama_get_device_memory_data( + path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, hp_nldl, log_level); + + // reset + tensor_buft_overrides[0] = {nullptr, nullptr}; + mparams->tensor_buft_overrides = tensor_buft_overrides; + + for (const llama_device_memory_data & dmd : dmds_cpu_moe) { + global_surplus_cpu_moe += dmd.free; + global_surplus_cpu_moe -= int64_t(dmd.mb.total()) + margin; + } + } + + std::vector targets; // maximum acceptable memory use per device + targets.reserve(nd); + for (size_t id = 0; id < nd; id++) { + targets.push_back(dmds_full[id].free - margin); + } + + std::vector mem; + std::vector ngl_per_device(nd); + ngl_per_device.back().overflow_type = LAYER_FRACTION_MOE; + if (hp_nex > 0) { + if (global_surplus_cpu_moe >= 0) { + LLAMA_LOG_INFO("%s: with only dense weights in device memory there is a total surplus of %" PRId64 " MiB\n", + __func__, global_surplus_cpu_moe/MiB); + ngl_per_device.back().il_stop = hp_ngl; + } else { + LLAMA_LOG_INFO("%s: with only dense weights in device memory there is still a total deficit of %" PRId64 " MiB\n", + __func__, -global_surplus_cpu_moe/MiB); + } + } + + // utility function that iteratively fills up devices using the false position method + auto distribute_layers = [&](const char * func_name) { + LLAMA_LOG_INFO("%s: distributing layers across devices with overflow to next device/system memory:\n", func_name); + size_t id = 0; + + // dense: iterate 
over all devices, fill front to back with full layers + // MoE: iterate over all but the last device, convert dense-only layers from last device to full layers + const size_t id_stop = hp_nex == 0 ? nd : nd - 1; + for (; id < id_stop; id++) { + std::vector ngl_per_device_high = ngl_per_device; + { + const uint32_t step_size = hp_nex == 0 ? + (id == 0 ? hp_ngl : hp_ngl - ngl_per_device[id-1].il_stop) : + ngl_per_device.back().il_stop - ngl_per_device.back().il_full_start; + ngl_per_device_high[id].il_part_start += step_size; + ngl_per_device_high[id].il_stop += step_size; + size_t jd = id + 1; + for (; jd < id_stop; jd++) { + ngl_per_device_high[jd].il_part_start += step_size; + ngl_per_device_high[jd].il_full_start += step_size; + ngl_per_device_high[jd].il_stop += step_size; + } + if (jd < nd) { + ngl_per_device_high[jd].il_part_start += step_size; + ngl_per_device_high[jd].il_full_start += step_size; + } + } + std::vector mem_high = get_memory_for_layers(func_name, ngl_per_device_high); + mem = get_memory_for_layers(func_name, ngl_per_device); + + if (mem_high[id] < targets[id]) { + ngl_per_device = ngl_per_device_high; + } else if (mem[id] < targets[id]) { + assert(ngl_per_device_high[id].il_stop > ngl_per_device[id].il_stop); + uint32_t delta = ngl_per_device_high[id].il_stop - ngl_per_device[id].il_stop; + while (delta > 1) { + uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); + step_size = std::max(step_size, uint32_t(1)); + step_size = std::min(step_size, delta - 1); + + std::vector ngl_per_device_test = ngl_per_device; + ngl_per_device_test[id].il_part_start += step_size; + ngl_per_device_test[id].il_stop += step_size; + size_t jd = id + 1; + for (; jd < id_stop; jd++) { + ngl_per_device_test[jd].il_part_start += step_size; + ngl_per_device_test[jd].il_full_start += step_size; + ngl_per_device_test[jd].il_stop += step_size; + } + if (jd < nd) { + ngl_per_device_test[jd].il_part_start += step_size; + ngl_per_device_test[jd].il_full_start += step_size; + } + const std::vector mem_test = get_memory_for_layers(func_name, ngl_per_device_test); + + if (mem_test[id] <= targets[id]) { + ngl_per_device = ngl_per_device_test; + mem = mem_test; + } else { + ngl_per_device_high = ngl_per_device_test; + mem_high = mem_test; + } + delta = ngl_per_device_high[id].il_stop - ngl_per_device[id].il_stop; + } + } + + // try to fit at least part of one more layer + std::vector ngl_per_device_test = ngl_per_device; + ngl_per_device_test[id].il_stop++; + size_t jd = id + 1; + for (; jd < id_stop; jd++) { + ngl_per_device_test[jd].il_part_start++; + ngl_per_device_test[jd].il_full_start++; + ngl_per_device_test[jd].il_stop++; + } + if (jd < nd) { + ngl_per_device_test[jd].il_part_start++; + ngl_per_device_test[jd].il_full_start++; + } + for (layer_fraction_t lf : {LAYER_FRACTION_ATTN, LAYER_FRACTION_UP, LAYER_FRACTION_GATE}) { + ngl_per_device_test[id].overflow_type = lf; + std::vector mem_test = get_memory_for_layers(func_name, ngl_per_device_test); + if (mem_test[id] > targets[id]) { + break; + } + ngl_per_device = ngl_per_device_test; + mem = mem_test; + } + + const int64_t projected_margin = dmds_full[id].free - mem[id]; + const uint32_t n_layer = ngl_per_device[id].il_stop - ngl_per_device[id].il_full_start; + const uint32_t n_partial = ngl_per_device[id].il_stop - ngl_per_device[id].il_part_start; + LLAMA_LOG_INFO( + "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + func_name, dev_names[id].c_str(), 
n_layer, n_partial, mem[id]/MiB, projected_margin/MiB); + } + + if (id >= nd) { + return; // we already processed all devices + } + + if (mem.empty()) { + mem = get_memory_for_layers(func_name, ngl_per_device); + } + + std::vector ngl_per_device_high = ngl_per_device; + ngl_per_device_high[id].il_part_start = ngl_per_device_high[id].il_stop; + std::vector mem_high = get_memory_for_layers(func_name, ngl_per_device_high); + uint32_t delta = ngl_per_device_high[id].il_part_start - ngl_per_device[id].il_part_start; + while (delta > 1) { + uint32_t step_size = int64_t(delta) * (targets[id] - mem[id]) / (mem_high[id] - mem[id]); + step_size = std::max(step_size, uint32_t(1)); + step_size = std::min(step_size, delta - 1); + + std::vector ngl_per_device_test = ngl_per_device; + ngl_per_device_test[id].il_part_start += step_size; + const std::vector mem_test = get_memory_for_layers(func_name, ngl_per_device_test); + + if (mem_test[id] <= targets[id]) { + ngl_per_device = ngl_per_device_test; + mem = mem_test; + } else { + ngl_per_device_high = ngl_per_device_test; + mem_high = mem_test; + } + delta = ngl_per_device_high[id].il_part_start - ngl_per_device[id].il_part_start; + } + + if (ngl_per_device[id].il_stop > ngl_per_device[id].il_part_start) { + // try to fit at least part of one more layer + std::vector ngl_per_device_test = ngl_per_device; + ngl_per_device_test[id].il_part_start++; + for (layer_fraction_t lf : {LAYER_FRACTION_UP, LAYER_FRACTION_GATE}) { + ngl_per_device_test[id].overflow_type = lf; + std::vector mem_test = get_memory_for_layers(func_name, ngl_per_device_test); + if (mem_test[id] > targets[id]) { + break; + } + ngl_per_device = ngl_per_device_test; + mem = mem_test; + } + } + + const int64_t projected_margin = dmds_full[id].free - mem[id]; + const uint32_t n_layer = ngl_per_device[id].il_stop - ngl_per_device[id].il_full_start; + const uint32_t n_partial = ngl_per_device[id].il_stop - ngl_per_device[id].il_part_start; + LLAMA_LOG_INFO( + "%s: - %s: %2" PRIu32 " layers (%2" PRIu32 " overflowing), %6" PRId64 " MiB used, %6" PRId64 " MiB free\n", + func_name, dev_names[id].c_str(), n_layer, n_partial, mem[id]/MiB, projected_margin/MiB); + }; + + distribute_layers(__func__); + set_ngl_tensor_split_tbo(ngl_per_device, *mparams); +} + +bool llama_params_fit( + const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, + float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, + size_t margin_s, uint32_t n_ctx_min, enum ggml_log_level log_level) { + const int64_t t0_us = llama_time_us(); + bool ok = true; + try { + llama_params_fit_impl(path_model, mparams, cparams, tensor_split, tensor_buft_overrides, margin_s, n_ctx_min, log_level); + LLAMA_LOG_INFO("%s: successfully fit params to free device memory\n", __func__); + } catch (const std::runtime_error & e) { + LLAMA_LOG_WARN("%s: failed to fit params to free device memory: %s\n", __func__, e.what()); + ok = false; + } + const int64_t t1_us = llama_time_us(); + LLAMA_LOG_INFO("%s: fitting params to free memory took %.2f seconds\n", __func__, (t1_us - t0_us) * 1e-6); + return ok; +} + struct llama_sampler_chain_params llama_sampler_chain_default_params() { struct llama_sampler_chain_params result = { /*.no_perf =*/ true, @@ -49,6 +640,10 @@ size_t llama_max_devices(void) { return 16; } +size_t llama_max_tensor_buft_overrides() { + return 4096; +} + bool llama_supports_mmap(void) { return llama_mmap::SUPPORTED; } @@ -108,11 +703,12 @@ static int 
@@ -108,11 +703,12 @@ static int llama_model_load(const std::string & fname, std::vector<std::string> & splits, llama_model & model, llama_model_params & params) {
     model.t_start_us = tm.t_start_us;
 
     try {
-        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides);
+        llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.no_alloc, params.kv_overrides, params.tensor_buft_overrides);
 
         ml.print_info();
 
         model.hparams.vocab_only = params.vocab_only;
+        model.hparams.no_alloc = params.no_alloc;
 
         try {
             model.load_arch(ml);
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index d64956b843851..63800e23c6932 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -36,4 +36,5 @@ else()
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
     endif()
+    add_subdirectory(fit-params)
 endif()
diff --git a/tools/fit-params/CMakeLists.txt b/tools/fit-params/CMakeLists.txt
new file mode 100644
index 0000000000000..34c3373f83c04
--- /dev/null
+++ b/tools/fit-params/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(TARGET llama-fit-params)
+add_executable(${TARGET} fit-params.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()
diff --git a/tools/fit-params/README.md b/tools/fit-params/README.md
new file mode 100644
index 0000000000000..8f0c958a2ffdd
--- /dev/null
+++ b/tools/fit-params/README.md
@@ -0,0 +1,55 @@
+# fit-params
+
+llama.cpp binaries can automatically fit the projected memory use of a model to the free device memory available at runtime.
+This is controlled using the CLI arguments starting with `-fit`/`--fit`.
+Internally, the code calls `llama_params_fit` to adjust the `llama_model_params` and `llama_context_params` structs.
+`llama-fit-params` is a simple utility that prints the CLI arguments corresponding to these adjustments to stdout.
+Example usage:
+
+``` bash
+# First, run llama-fit-params and store the results in a file:
+> ./build/bin/llama-fit-params --model /opt/models/qwen_3-30b3a-f16.gguf | tee args.txt
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
+build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
+llama_params_fit_impl: projected to use 61807 MiB of device memory vs. 24077 MiB of free device memory
+llama_params_fit_impl: cannot fulfill margin of 1024 MiB, need to reduce device memory by 42444 MiB
+llama_params_fit_impl: context size reduced from 40960 to 4096 -> need 3456 MiB less memory in total
+llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 16164 MiB
+llama_params_fit_impl: distributing layers across devices with overflow to next device/system memory:
+llama_params_fit_impl: - CUDA0 (NVIDIA GeForce RTX 4090): 48 layers (34 overflowing), 19187 MiB used, 1199 MiB free
+llama_params_fit: successfully fit params to free device memory
+llama_params_fit: fitting params to free memory took 1.15 seconds
+Printing fitted CLI arguments to stdout...
+-c 4096 -ngl 48 -ot blk\.14\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.15\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.16\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.17\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.18\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.19\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.20\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.21\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.22\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.23\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.24\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.25\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.26\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.27\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.28\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.29\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.30\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.31\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.32\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.33\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.34\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.35\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.36\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.37\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.38\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.39\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.40\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.41\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.42\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.43\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.44\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.45\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.46\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.47\.ffn_(up|down|gate)_(ch|)exps=CPU
+
+# Next, use those results for a llama.cpp binary:
+> cat args.txt | xargs ./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 CUDA devices:
+  Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
+build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
+system info: n_threads = 16, n_threads_batch = 16, total_threads = 32
+
+system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+
+main: binding port with default address family
+main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31
+main: loading model
+srv load_model: loading model '/opt/models/qwen_3-30b3a-f16.gguf'
+llama_params_fit_impl: projected to use 19187 MiB of device memory vs. 24077 MiB of free device memory
+llama_params_fit_impl: will leave 1199 >= 1024 MiB of free device memory, no changes needed
+llama_params_fit: successfully fit params to free device memory
+llama_params_fit: fitting params to free memory took 0.28 seconds
+[...]
+main: server is listening on http://127.0.0.1:8080 - starting the main loop
+srv update_slots: all slots are idle
+^Csrv operator(): operator(): cleaning up before exit...
+
+llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+llama_memory_breakdown_print: | - CUDA0 (RTX 4090) | 24077 = 945 + (19187 = 17904 + 384 + 898) + 3945 |
+llama_memory_breakdown_print: | - Host | 58271 = 58259 + 0 + 12 |
+```
diff --git a/tools/fit-params/fit-params.cpp b/tools/fit-params/fit-params.cpp
new file mode 100644
index 0000000000000..e34fc0abe9ebe
--- /dev/null
+++ b/tools/fit-params/fit-params.cpp
@@ -0,0 +1,62 @@
+#include "llama.h"
+
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+
+#include <iostream>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+    auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+    llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx,
+        params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+
+    LOG_INF("Printing fitted CLI arguments to stdout...\n");
+    std::cout << "-c " << cparams.n_ctx;
+    std::cout << " -ngl " << mparams.n_gpu_layers;
+
+    size_t nd = llama_max_devices();
+    while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
+        nd--;
+    }
+    if (nd > 1) {
+        for (size_t id = 0; id < nd; id++) {
+            if (id == 0) {
+                std::cout << " -ts ";
+            }
+            if (id > 0) {
+                std::cout << ",";
+            }
+            std::cout << mparams.tensor_split[id];
+        }
+    }
+
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
+        if (itbo == 0) {
+            std::cout << " -ot ";
+        }
+        if (itbo > 0) {
+            std::cout << ",";
+        }
+        std::cout << mparams.tensor_buft_overrides[itbo].pattern << "=" << ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft);
+    }
+    std::cout << "\n";
+
+    return 0;
+}
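For programs that link against libllama directly instead of going through a llama.cpp binary, the fitting step can also be run before loading the model. The following is a minimal sketch based on the `llama_params_fit` and `llama_max_tensor_buft_overrides` declarations added in this diff: the model path, the margin, and the minimum context size are placeholder values, and pointing `llama_model_params` at the caller-owned `tensor_split`/`tensor_buft_overrides` buffers is an assumption about how the fitted values are meant to be consumed, not a documented requirement.

```cpp
#include "llama.h"

#include <vector>

int main() {
    llama_backend_init();

    llama_model_params   mparams = llama_model_default_params();
    llama_context_params cparams = llama_context_default_params();

    // caller-owned buffers for the fitted tensor split and buffer-type overrides,
    // sized to the maxima exposed by the API
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::vector<llama_model_tensor_buft_override> tbo(llama_max_tensor_buft_overrides(), {nullptr, nullptr});
    mparams.tensor_split          = tensor_split.data();
    mparams.tensor_buft_overrides = tbo.data();

    const char * path = "model.gguf"; // placeholder

    // 1 GiB margin per device, allow shrinking the context down to 4096 tokens
    llama_params_fit(path, &mparams, &cparams, tensor_split.data(), tbo.data(),
                     1024u*1024u*1024u, 4096, GGML_LOG_LEVEL_INFO);

    // load the model and create the context with the (possibly adjusted) params
    llama_model * model = llama_model_load_from_file(path, mparams);
    if (model != nullptr) {
        llama_context * ctx = llama_init_from_model(model, cparams);
        if (ctx != nullptr) {
            // ... run inference ...
            llama_free(ctx);
        }
        llama_model_free(model);
    }

    llama_backend_free();
    return 0;
}
```

Since `llama_params_fit` returns a bool and logs a warning on failure, a caller could also choose to abort instead of loading with the unadjusted parameters; the sketch simply proceeds either way, as `llama-fit-params` does.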