From 6427ce94dae40967d72b3ead3f44a267cb4da0e8 Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Thu, 20 Mar 2025 14:03:16 +0800 Subject: [PATCH 1/2] [Fix] Compiling clip-quantize-cli and running it in a CUDA environment will cause ggml_fp16_to_fp32 to report an error when trying to access video memory. You need to switch to the CPU backend to run quantize. After the fix, it will automatically run in the CPU backend and will no longer be bound to CUDA. --- examples/llava/clip.cpp | 6 +++--- examples/llava/clip.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a1f050e39a094..ae39ab52d284e 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1370,9 +1370,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // read and create ggml_context containing the tensors and their data -struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { +struct clip_ctx * clip_model_load(const char * fname, const int verbosity, const bool use_gpu) { return clip_init(fname, clip_context_params{ - /* use_gpu */ true, + /* use_gpu */ use_gpu, /* verbosity */ verbosity, }); } @@ -2989,7 +2989,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i assert(itype < GGML_TYPE_COUNT); ggml_type type = static_cast<ggml_type>(itype); - auto * ctx_clip = clip_model_load(fname_inp, 2); + auto * ctx_clip = clip_model_load(fname_inp, 2, false); const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 47059ca1b9f78..f23f1d7046cfa 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -45,7 +45,7 @@ struct clip_context_params { }; // deprecated, use clip_init -CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity); +CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity=1, const bool 
use_gpu=true); CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params); From 3e181163f2e26bbe10c56bfe725c4252e868c2f3 Mon Sep 17 00:00:00 2001 From: Ivy233 Date: Wed, 26 Mar 2025 11:07:51 +0800 Subject: [PATCH 2/2] [Fix] Roll back the signature and implementation of clip_model_load, and change the call in clip_model_quantize to clip_init. --- examples/llava/clip.cpp | 9 ++++++--- examples/llava/clip.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ae39ab52d284e..58ee5cf0174b2 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1370,9 +1370,9 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 } // read and create ggml_context containing the tensors and their data -struct clip_ctx * clip_model_load(const char * fname, const int verbosity, const bool use_gpu) { +struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { return clip_init(fname, clip_context_params{ - /* use_gpu */ use_gpu, + /* use_gpu */ true, /* verbosity */ verbosity, }); } @@ -2989,7 +2989,10 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i assert(itype < GGML_TYPE_COUNT); ggml_type type = static_cast<ggml_type>(itype); - auto * ctx_clip = clip_model_load(fname_inp, 2, false); + auto * ctx_clip = clip_init(fname_inp, clip_context_params{ + /* use_gpu */ false, + /* verbosity */ 2, + }); const auto & ctx_src = ctx_clip->ctx_gguf; const auto & ctx_data = ctx_clip->ctx_data; diff --git a/examples/llava/clip.h b/examples/llava/clip.h index f23f1d7046cfa..47059ca1b9f78 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -45,7 +45,7 @@ struct clip_context_params { }; // deprecated, use clip_init -CLIP_API struct clip_ctx * clip_model_load(const char * fname, const int verbosity=1, const bool use_gpu=true); +CLIP_API struct clip_ctx * clip_model_load(const char * fname, int 
verbosity); CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);