@@ -55,6 +55,11 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) {
 
 // helper struct to make working with embd batch easier
 // note: this will be removed after llama_batch_ext refactoring
+// note 2: Normally, a batch's `pos` stores linearly increasing positions.
+//         However, some multi-modal models require a special position embedding (e.g. M-RoPE in qwen2vl and qwen2.5vl),
+//         while linearly increasing positions are still needed for proper causal attention masking.
+//         So we store both: the first n_tokens elements keep the usual linear positions, and the model-specific positions are appended after them.
+//         As a result, `pos` has `n_tokens * (n_pos_per_embd + 1)` elements.
 struct decode_embd_batch {
     int n_pos_per_embd;
     int n_mmproj_embd;
@@ -66,7 +71,7 @@ struct decode_embd_batch {
     std::vector<int8_t>        logits;
     llama_batch batch;
     decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
-        pos     .resize(n_tokens * n_pos_per_embd);
+        pos     .resize(n_tokens * (n_pos_per_embd + 1));
         n_seq_id.resize(n_tokens);
         seq_ids .resize(n_tokens + 1);
         logits  .resize(n_tokens);
@@ -100,13 +105,14 @@ struct decode_embd_batch {
         for (int y = 0; y < ny; y++) {
             for (int x = 0; x < nx; x++) {
                 int i = y * nx + x;
-                pos[i                     ] = pos_0;
-                pos[i + batch.n_tokens    ] = pos_0 + y;
-                pos[i + batch.n_tokens * 2] = pos_0 + x;
-                pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+                pos[i + batch.n_tokens    ] = pos_0;
+                pos[i + batch.n_tokens * 2] = pos_0 + y;
+                pos[i + batch.n_tokens * 3] = pos_0 + x;
+                pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
             }
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -118,12 +124,13 @@ struct decode_embd_batch {
         GGML_ASSERT(n_pos_per_embd == 4);
         seq_id_0[0] = seq_id;
         for (int i = 0; i < batch.n_tokens; i++) {
-            pos[i                     ] = pos_0 + i;
             pos[i + batch.n_tokens    ] = pos_0 + i;
             pos[i + batch.n_tokens * 2] = pos_0 + i;
-            pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
+            pos[i + batch.n_tokens * 3] = pos_0 + i;
+            pos[i + batch.n_tokens * 4] = 0; // last pos dim is unused
         }
         for (int i = 0; i < batch.n_tokens; i++) {
+            batch.pos     [i] = pos_0 + i;
             batch.n_seq_id[i] = 1;
             batch.seq_id  [i] = seq_id_0.data();
             batch.logits  [i] = false;
@@ -133,12 +140,12 @@ struct decode_embd_batch {
     llama_batch get_view(int offset, int n_tokens) {
         llama_pos * pos_ptr;
         pos_view.clear();
-        pos_view.reserve(n_tokens * n_pos_per_embd);
+        pos_view.reserve(n_tokens * (n_pos_per_embd + 1));
         if (n_pos_per_embd > 1) {
             // mrope
             // for example, with layout of src: 1234...1234...1234...1234...
             // offset 2 will give us dst: 34...34...34...34...
-            for (int i = 0; i < n_pos_per_embd; i++) {
+            for (int i = 0; i <= n_pos_per_embd; i++) {
                 // assume n_tokens is less than or equal to batch.n_tokens
                 // batch.n_tokens is number of **total** tokens
                 // n_tokens is the number of viewed tokens
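
For readers following the layout change: below is a minimal standalone sketch of the new `pos` buffer, assuming `n_pos_per_embd == 4` (M-RoPE) and using plain `std::vector<int32_t>` in place of `llama_pos`/`llama_batch`. It mirrors the text-token (1D) fill pattern from the hunks above; it is an illustration under those assumptions, not the helper itself.

// Sketch of the pos layout: n_tokens * (n_pos_per_embd + 1) elements,
// first the linear positions (what batch.pos points at), then the M-RoPE blocks.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t n_tokens       = 3;
    const int     n_pos_per_embd = 4;  // M-RoPE uses 4 position dims
    const int32_t pos_0          = 10; // first position of this chunk

    std::vector<int32_t> pos(n_tokens * (n_pos_per_embd + 1));

    for (int32_t i = 0; i < n_tokens; i++) {
        pos[i]                = pos_0 + i; // linear position, used for causal masking
        pos[i + n_tokens]     = pos_0 + i; // M-RoPE dim 0 for text tokens
        pos[i + n_tokens * 2] = pos_0 + i; // M-RoPE dim 1
        pos[i + n_tokens * 3] = pos_0 + i; // M-RoPE dim 2
        pos[i + n_tokens * 4] = 0;         // last pos dim is unused
    }

    // print one row per block: 10 11 12 (linear), then the four M-RoPE blocks
    for (size_t k = 0; k < pos.size(); k++) {
        printf("%d%s", pos[k], (k + 1) % n_tokens == 0 ? "\n" : " ");
    }
    return 0;
}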
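Likewise, a sketch of the slicing that `get_view()` performs after the change: it copies `n_tokens` positions starting at `offset` from every one of the `n_pos_per_embd + 1` blocks, hence the loop bound `i <= n_pos_per_embd`. The names `n_total` and `n_view` and the toy values are illustrative, not taken from the source.

// Sketch of view slicing over the block layout [linear][dim0][dim1][dim2][dim3].
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int32_t n_total        = 4; // batch.n_tokens: total tokens in the batch
    const int     n_pos_per_embd = 4;
    const int32_t offset         = 2; // first token of the view
    const int32_t n_view         = 2; // tokens in the view (<= n_total)

    // src layout, each block n_total long, e.g. 1234 1234 1234 1234 1234
    std::vector<int32_t> pos(n_total * (n_pos_per_embd + 1));
    for (int b = 0; b <= n_pos_per_embd; b++) {
        for (int32_t i = 0; i < n_total; i++) {
            pos[b * n_total + i] = i + 1;
        }
    }

    // take n_view positions starting at offset from *every* block,
    // including the leading linear block
    std::vector<int32_t> pos_view;
    pos_view.reserve(n_view * (n_pos_per_embd + 1));
    for (int i = 0; i <= n_pos_per_embd; i++) {
        const int src = i * n_total + offset;
        pos_view.insert(pos_view.end(), pos.begin() + src, pos.begin() + src + n_view);
    }

    for (size_t k = 0; k < pos_view.size(); k++) {
        printf("%d%s", pos_view[k], (k + 1) % n_view == 0 ? "  " : " ");
    }
    printf("\n"); // prints: 3 4  3 4  3 4  3 4  3 4
    return 0;
}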