
Commit 596a297

llama-fit-params tool
1 parent 63c37ba commit 596a297

6 files changed, 128 additions, 7 deletions

common/common.cpp

Lines changed: 1 addition & 7 deletions

@@ -952,15 +952,9 @@ struct common_init_result common_init_from_params(common_params & params) {
     auto cparams = common_context_params_to_llama(params);
 
     if (params.fit_params) {
-        const bool fit_successful = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+        llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
             params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx,
             params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
-
-        if (fit_successful) {
-            LOG_INF("%s: successfully fit parameters to device memory\n", __func__);
-        } else {
-            LOG_WRN("%s: failed to fit parameters to device memory, may crash during allocation\n", __func__);
-        }
     }
 
     llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
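
Note on the common/common.cpp change above: the caller-side success/failure log is removed; judging by the README output added in this commit, `llama_params_fit` now reports the outcome itself. For a caller that still wants to branch on the result, here is a minimal standalone sketch. It assumes the boolean-convertible return value that the removed `fit_successful` check relied on, and uses only calls that appear elsewhere in this commit plus the standard `llama_model_free`/`llama_backend_free` cleanup.

```cpp
// Minimal sketch (not part of this commit): branch on the result of llama_params_fit.
// Assumes the function returns a boolean-convertible success flag, as the removed
// `fit_successful` variable implied.
#include "llama.h"

#include "arg.h"
#include "common.h"
#include "log.h"

int main(int argc, char ** argv) {
    common_params params;
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

    common_init();
    llama_backend_init();
    llama_numa_init(params.numa);

    auto mparams = common_model_params_to_llama(params);
    auto cparams = common_context_params_to_llama(params);

    const bool ok = llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx,
        params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
    if (!ok) {
        // Abort before attempting an allocation that may fail, instead of only warning.
        LOG_WRN("failed to fit parameters to device memory\n");
        llama_backend_free();
        return 1;
    }

    llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
    // ... create a context from cparams and use the model ...
    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```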

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -89,6 +89,7 @@ enum llama_example {
     LLAMA_EXAMPLE_TTS,
     LLAMA_EXAMPLE_DIFFUSION,
     LLAMA_EXAMPLE_FINETUNE,
+    LLAMA_EXAMPLE_FIT_PARAMS,
 
     LLAMA_EXAMPLE_COUNT,
 };

tools/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -36,4 +36,5 @@ else()
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
     endif()
+    add_subdirectory(fit-params)
 endif()

tools/fit-params/CMakeLists.txt

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+set(TARGET llama-fit-params)
+add_executable(${TARGET} fit-params.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
+
+if(LLAMA_TOOLS_INSTALL)
+    install(TARGETS ${TARGET} RUNTIME)
+endif()

tools/fit-params/README.md

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+# fit-params
+
+llama.cpp binaries can automatically fit the projected memory use of a model to the free device memory available at runtime;
+this is controlled via the CLI arguments starting with `-fit`/`--fit`.
+Internally, the code calls `llama_params_fit` to adjust the `llama_model_params` and `llama_context_params` structs.
+`llama-fit-params` is a simple utility that prints the CLI arguments corresponding to these adjustments to stdout.
+Example usage:
+
+``` bash
+# First, run llama-fit-params and store the results in a file:
+> ./build/bin/llama-fit-params --model /opt/models/qwen_3-30b3a-f16.gguf | tee args.txt
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 CUDA devices:
+Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
+build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
+llama_params_fit_impl: projected to use 61807 MiB of device memory vs. 24077 MiB of free device memory
+llama_params_fit_impl: cannot fulfill margin of 1024 MiB, need to reduce device memory by 42444 MiB
+llama_params_fit_impl: context size reduced from 40960 to 4096 -> need 3456 MiB less memory in total
+llama_params_fit_impl: with only dense weights in device memory there is a total surplus of 16164 MiB
+llama_params_fit_impl: distributing layers across devices with overflow to next device/system memory:
+llama_params_fit_impl: - CUDA0 (NVIDIA GeForce RTX 4090): 48 layers (34 overflowing), 19187 MiB used, 1199 MiB free
+llama_params_fit: successfully fit params to free device memory
+llama_params_fit: fitting params to free memory took 1.15 seconds
+Printing fitted CLI arguments to stdout...
+-c 4096 -ngl 48 -ot blk\.14\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.15\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.16\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.17\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.18\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.19\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.20\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.21\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.22\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.23\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.24\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.25\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.26\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.27\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.28\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.29\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.30\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.31\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.32\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.33\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.34\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.35\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.36\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.37\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.38\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.39\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.40\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.41\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.42\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.43\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.44\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.45\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.46\.ffn_(up|down|gate)_(ch|)exps=CPU,blk\.47\.ffn_(up|down|gate)_(ch|)exps=CPU
+
+# Next, use those results for a llama.cpp binary:
+> cat args.txt | xargs ./build/bin/llama-server --model /opt/models/qwen_3-30b3a-f16.gguf
+ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ggml_cuda_init: found 1 CUDA devices:
+Device 0: NVIDIA GeForce RTX 4090, compute capability 8.9, VMM: yes
+build: 6895 (4341dc8bc) with cc (GCC) 15.2.1 20250813 for x86_64-pc-linux-gnu
+system info: n_threads = 16, n_threads_batch = 16, total_threads = 32
+
+system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 890 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX_VNNI = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+
+main: binding port with default address family
+main: HTTP server is listening, hostname: 127.0.0.1, port: 8080, http threads: 31
+main: loading model
+srv load_model: loading model '/opt/models/qwen_3-30b3a-f16.gguf'
+llama_params_fit_impl: projected to use 19187 MiB of device memory vs. 24077 MiB of free device memory
+llama_params_fit_impl: will leave 1199 >= 1024 MiB of free device memory, no changes needed
+llama_params_fit: successfully fit params to free device memory
+llama_params_fit: fitting params to free memory took 0.28 seconds
+[...]
+main: server is listening on http://127.0.0.1:8080 - starting the main loop
+srv update_slots: all slots are idle
+^Csrv operator(): operator(): cleaning up before exit...
+
+llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+llama_memory_breakdown_print: | - CUDA0 (RTX 4090) | 24077 = 945 + (19187 = 17904 + 384 + 898) + 3945 |
+llama_memory_breakdown_print: | - Host | 58271 = 58259 + 0 + 12 |
+```

tools/fit-params/fit-params.cpp

Lines changed: 62 additions & 0 deletions

@@ -0,0 +1,62 @@
+#include "llama.h"
+
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+
+#include <iostream>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+        return 1;
+    }
+
+    common_init();
+    llama_backend_init();
+    llama_numa_init(params.numa);
+    auto mparams = common_model_params_to_llama(params);
+    auto cparams = common_context_params_to_llama(params);
+    llama_params_fit(params.model.path.c_str(), &mparams, &cparams,
+        params.tensor_split, params.tensor_buft_overrides.data(), params.fit_params_margin, params.fit_params_min_ctx,
+        params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_ERROR);
+
+    LOG_INF("Printing fitted CLI arguments to stdout...\n");
+    std::cout << "-c " << cparams.n_ctx;
+    std::cout << " -ngl " << mparams.n_gpu_layers;
+
+    size_t nd = llama_max_devices();
+    while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
+        nd--;
+    }
+    if (nd > 1) {
+        for (size_t id = 0; id < nd; id++) {
+            if (id == 0) {
+                std::cout << " -ts ";
+            }
+            if (id > 0) {
+                std::cout << ",";
+            }
+            std::cout << mparams.tensor_split[id];
+        }
+    }
+
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    for (size_t itbo = 0; itbo < ntbo && mparams.tensor_buft_overrides[itbo].pattern != nullptr; itbo++) {
+        if (itbo == 0) {
+            std::cout << " -ot ";
+        }
+        if (itbo > 0) {
+            std::cout << ",";
+        }
+        std::cout << mparams.tensor_buft_overrides[itbo].pattern << "=" << ggml_backend_buft_name(mparams.tensor_buft_overrides[itbo].buft);
+    }
+    std::cout << "\n";
+
+    return 0;
+}
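
A side note on the argument printing above: the `-ot` value is a comma-separated list of `pattern=buffer-type` pairs, and `mparams.tensor_buft_overrides` is terminated by an entry with a null `pattern`. The same walk can be packaged as a string-building helper; the function name and the use of `std::ostringstream` below are illustrative and not part of this commit.

```cpp
// Hypothetical helper: build the value of the "-ot" argument as a string from the
// sentinel-terminated tensor_buft_overrides array, rather than streaming it to stdout.
#include <sstream>
#include <string>

#include "llama.h"

static std::string format_ot_arg(const llama_model_params & mparams) {
    std::ostringstream oss;
    const size_t ntbo = llama_max_tensor_buft_overrides();
    for (size_t i = 0; i < ntbo && mparams.tensor_buft_overrides[i].pattern != nullptr; i++) {
        if (i > 0) {
            oss << ",";
        }
        oss << mparams.tensor_buft_overrides[i].pattern << "="
            << ggml_backend_buft_name(mparams.tensor_buft_overrides[i].buft);
    }
    return oss.str(); // empty if there are no overrides
}
```

With such a helper, the printing path in `main` would reduce to emitting `" -ot "` followed by `format_ot_arg(mparams)` whenever the returned string is non-empty.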
