12 changes: 3 additions & 9 deletions tools/mtmd/clip.cpp
@@ -2790,14 +2790,8 @@ struct clip_model_loader {
         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
         get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
         // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
-        // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
-        // but we set a lower value to avoid OOM
-        // TODO: make it configurable by user
-        // TODO (2): bbox coordinates become inaccurate with small number of tokens,
-        //           therefore we need to increase the min_tokens
-        //           see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
-        hparams.set_limit_image_tokens(8, 2048);
-        hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+        hparams.set_limit_image_tokens(8, 4096);
+        hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
         const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
         if (hparams.image_min_pixels < warn_min_pixels) {
             LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
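Note: the new limits match the arithmetic in the removed comment. A minimal standalone sketch of that calculation, assuming the Qwen2.5-VL preprocessor_config.json values (max_pixels = 12845056, patch_size = 14, 2x2 spatial merge); the trailing /4 is taken verbatim from the old comment:

// Sketch only: reproduces the token-limit arithmetic from the removed comment.
// Assumed values: max_pixels = 12845056, patch_size = 14, 2x2 spatial merge.
#include <cstdio>

int main() {
    const int max_pixels = 12845056;
    const int patch_size = 14;
    const int merge      = 2;
    const int max_tokens = max_pixels / (patch_size * patch_size) / (merge * merge) / 4;
    std::printf("max image tokens : %d\n", max_tokens); // 4096 -> set_limit_image_tokens(8, 4096)
    std::printf("warmup tokens    : %d\n", 46 * 46);    // 2116 -> set_warmup_n_tokens(46*46)
    return 0;
}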
@@ -4813,7 +4807,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                const int merge_ratio = 2;
+                const int merge_ratio = hparams.n_merge;
                 const int pw = image_size_width / patch_size;
                 const int ph = image_size_height / patch_size;
                 std::vector<int> positions(n_pos * 4);
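For context on this hunk: merge_ratio is now read from hparams.n_merge (populated via KEY_SPATIAL_MERGE_SIZE above) instead of being hard-coded to 2, so the position grid follows the model's declared spatial merge size. Below is a minimal, illustrative sketch of a merge-aware patch-position layout; it is not the actual clip.cpp loop, and the names and grid size are made up:

// Illustrative only: walk a pw x ph patch grid in merge_ratio x merge_ratio blocks,
// so patches that get merged into one token are laid out contiguously.
#include <cstdio>
#include <utility>
#include <vector>

int main() {
    const int merge_ratio = 2;   // stands in for hparams.n_merge
    const int pw = 4, ph = 4;    // patches per row / column (made-up example size)
    std::vector<std::pair<int,int>> pos;

    for (int by = 0; by < ph; by += merge_ratio) {
        for (int bx = 0; bx < pw; bx += merge_ratio) {
            for (int dy = 0; dy < merge_ratio; dy++) {
                for (int dx = 0; dx < merge_ratio; dx++) {
                    pos.push_back({by + dy, bx + dx});
                }
            }
        }
    }
    for (const auto & p : pos) {
        std::printf("(%d,%d) ", p.first, p.second);
    }
    std::printf("\n"); // (0,0) (0,1) (1,0) (1,1) (0,2) (0,3) (1,2) (1,3) ...
    return 0;
}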