From f55b71f66b0f736b218368f2de6407194ff27a5b Mon Sep 17 00:00:00 2001
From: Pascal
Date: Sun, 2 Nov 2025 07:39:42 +0100
Subject: [PATCH 1/3] server: add minimax-m2 reasoning format override for MiniMax-M2 compatibility

MiniMax-M2 models require the complete <think>...</think> block, tags
included, to be present in the context for proper reasoning. This mode
injects a synthetic opening <think> tag into the stream while keeping all
reasoning tags inline in message.content, ensuring the model receives the
full reasoning block it needs. As a result, message.content arrives as e.g.
"<think>\n...model reasoning...</think>final answer" and the reasoning is
not extracted into message.reasoning_content.

Changes:
- Add COMMON_REASONING_FORMAT_MINIMAX_M2 enum value to common_reasoning_format
- Implement minimax-m2 format parsing that bypasses reasoning extraction
- Inject a synthetic <think>\n chunk at slot start when minimax-m2 is active
- Track injection state with the minimax_reasoning_prefix_injected slot flag
- Prepend <think>\n to generated_text for the final response and chat parsing
- Prevent client reasoning_format=auto from overriding the server CLI setting
- Add minimax-m2 to the CLI help, README.md, and code documentation
- Handle LLAMA_TOKEN_NULL in send_partial_response to skip token recording
- Update process_token to preserve delta_to_send for streaming correctness
---
 common/arg.cpp          |  1 +
 common/chat-parser.cpp  |  3 ++-
 common/chat.cpp         | 10 ++++++--
 common/common.h         |  1 +
 tools/server/README.md  |  2 +-
 tools/server/server.cpp | 57 ++++++++++++++++++++++++++++++++++-------
 6 files changed, 61 insertions(+), 13 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index d8f9bbd24301f..85e0018760688 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3442,6 +3442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- none: leaves thoughts unparsed in `message.content`\n"
         "- deepseek: puts thoughts in `message.reasoning_content`\n"
         "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
+        "- minimax-m2: streams a synthetic opening `<think>` and keeps `<think>` tags in `message.content`\n"
         "(default: auto)",
         [](common_params & params, const std::string & value) {
             params.reasoning_format = common_reasoning_format_from_name(value);
diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp
index ff83102788d49..f5728ef65bdb8 100644
--- a/common/chat-parser.cpp
+++ b/common/chat-parser.cpp
@@ -171,7 +171,8 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
 bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
     std::string pending_reasoning_prefix;
 
-    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+    if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE ||
+        syntax_.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
         return false;
     }
 
diff --git a/common/chat.cpp b/common/chat.cpp
index 63583fb22489d..dff5f3c1f6579 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -655,6 +655,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
         case COMMON_REASONING_FORMAT_AUTO: return "auto";
         case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
         case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
+        case COMMON_REASONING_FORMAT_MINIMAX_M2: return "minimax-m2";
         default:
             throw std::runtime_error("Unknown reasoning format");
     }
@@ -669,6 +670,8 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
         return COMMON_REASONING_FORMAT_DEEPSEEK;
     } else if (format == "deepseek-legacy") {
         return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+    } else if (format == "minimax-m2") {
+        return COMMON_REASONING_FORMAT_MINIMAX_M2;
     }
     throw std::runtime_error("Unknown reasoning format: " + format);
 }
@@ -1790,7 +1793,8 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
         // <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
         common_chat_parse_deepseek_v3_1_content(builder);
     } else {
-        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
+        if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
+            builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
             LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
             common_chat_parse_deepseek_v3_1_content(builder);
             return;
@@ -2001,7 +2005,9 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
         if (regex_match(analysis_regex, header)) {
             builder.move_to(header_start_pos);
 
-            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
+            if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
+                builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2 ||
+                builder.syntax().reasoning_in_content) {
                 builder.add_content(consume_end(true));
             } else {
                 builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
diff --git a/common/common.h b/common/common.h
index a8cb630ea5805..7d326dc1069b0 100644
--- a/common/common.h
+++ b/common/common.h
@@ -249,6 +249,7 @@ enum common_reasoning_format {
     COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
+    COMMON_REASONING_FORMAT_MINIMAX_M2, // Stream a synthetic opening <think> tag and keep <think> tags in `message.content` for MiniMax-M2 compatibility
     // do not extend this enum unless you absolutely have to
     // in most cases, use COMMON_REASONING_FORMAT_AUTO
     // see: https://github.com/ggml-org/llama.cpp/pull/15408
diff --git a/tools/server/README.md b/tools/server/README.md
index c16d0bd6dcd7f..4cbdfe42a3e81 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>- minimax-m2: streams a synthetic opening `<think>` and keeps `<think>` tags in `message.content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
 | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 92d30664e41f4..05045b8384d51 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -443,7 +443,10 @@ struct server_task {
             }
             common_reasoning_format reasoning_format = params_base.reasoning_format;
             if (data.contains("reasoning_format")) {
-                reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
+                const auto requested = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
+                if (requested != COMMON_REASONING_FORMAT_AUTO) {
+                    reasoning_format = requested;
+                }
             }
             params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
             params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
@@ -1660,6 +1663,7 @@ struct server_slot {
     bool has_next_token = true;
     bool has_new_line = false;
     bool truncated = false;
+    bool minimax_reasoning_prefix_injected = false;
 
     stop_type stop;
@@ -1730,6 +1734,7 @@ struct server_slot {
         generated_text = "";
         has_new_line = false;
         truncated = false;
+        minimax_reasoning_prefix_injected = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -1856,9 +1861,13 @@ struct server_slot {
         GGML_ASSERT(task);
 
         auto previous_msg = chat_msg;
-        SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
+        std::string text_to_parse = generated_text;
+        if (minimax_reasoning_prefix_injected) {
+            text_to_parse.insert(0, "<think>\n");
+        }
+        SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
         auto new_msg = common_chat_parse(
-            generated_text,
+            text_to_parse,
             /* is_partial= */ stop != STOP_TYPE_EOS,
             task->params.oaicompat_chat_syntax);
         if (!new_msg.empty()) {
@@ -2793,6 +2802,19 @@ struct server_context {
 
         slot.state = SLOT_STATE_STARTED;
 
+        const bool needs_minimax_prefix =
+            slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
+        if (needs_minimax_prefix) {
+            slot.minimax_reasoning_prefix_injected = true;
+            if (slot.task->params.stream) {
+                completion_token_output prefix_chunk{};
+                prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                prefix_chunk.prob = 0.0f;
+                prefix_chunk.text_to_send = "<think>\n";
+                send_partial_response(slot, prefix_chunk, false);
+            }
+        }
+
         SLT_INF(slot, "%s", "processing task\n");
 
         return true;
@@ -2848,7 +2870,10 @@ struct server_context {
                 result.text_to_send = "";
             }
 
+            std::string delta_to_send = result.text_to_send;
+            result.text_to_send = token_str;
             slot.add_token(result);
+            result.text_to_send = std::move(delta_to_send);
             if (slot.task->params.stream) {
                 send_partial_response(slot, result, false);
             }
@@ -3021,7 +3046,11 @@ struct server_context {
         return true;
     }
 
-    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
+    void send_partial_response(
+            server_slot & slot,
+            const completion_token_output & tkn,
+            bool is_progress,
+            const std::vector<common_chat_msg_diff> * forced_diffs = nullptr) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id = slot.task->id;
@@ -3035,9 +3064,15 @@ struct server_context {
             res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000);
         } else {
             res->content = tkn.text_to_send;
-            res->tokens = { tkn.tok };
+            if (tkn.tok != LLAMA_TOKEN_NULL) {
+                res->tokens = { tkn.tok };
+            }
 
-            slot.update_chat_msg(res->oaicompat_msg_diffs);
+            if (forced_diffs) {
+                res->oaicompat_msg_diffs = *forced_diffs;
+            } else {
+                slot.update_chat_msg(res->oaicompat_msg_diffs);
+            }
         }
 
         res->n_decoded = slot.n_decoded;
@@ -3050,7 +3085,7 @@ struct server_context {
         res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id;
 
         // populate res.probs_output
-        if (slot.task->params.sampling.n_probs > 0) {
+        if (slot.task->params.sampling.n_probs > 0 && tkn.tok != LLAMA_TOKEN_NULL) {
             res->prob_output = tkn; // copy the token probs
         }
 
@@ -3068,8 +3103,12 @@ struct server_context {
         res->id = slot.task->id;
         res->id_slot = slot.id;
 
-        res->index = slot.task->index;
-        res->content = slot.generated_text;
+        res->index = slot.task->index;
+        std::string response_content = slot.generated_text;
+        if (slot.minimax_reasoning_prefix_injected) {
+            response_content.insert(0, "<think>\n");
+        }
+        res->content = std::move(response_content);
         res->tokens = std::move(slot.generated_tokens);
         res->timings = slot.get_timings();
         res->prompt = slot.task->tokens.detokenize(ctx, true);

From 39351b1ec2bd31386b7e3b5ee62f9d20fbc3a9c9 Mon Sep 17 00:00:00 2001
From: Pascal
Date: Sun, 2 Nov 2025 08:37:15 +0100
Subject: [PATCH 2/3] server: defer minimax-m2 synthetic <think> until first generated token
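
The synthetic <think>\n prefix chunk is now emitted from process_token,
immediately before the first streamed token, instead of being sent
unconditionally when the slot starts.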
---
 tools/server/server.cpp | 36 ++++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 05045b8384d51..bc9dc6cf0e425 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1664,6 +1664,7 @@ struct server_slot {
     bool has_new_line = false;
     bool truncated = false;
     bool minimax_reasoning_prefix_injected = false;
+    bool minimax_reasoning_prefix_streamed = false;
 
     stop_type stop;
@@ -1735,6 +1736,7 @@ struct server_slot {
         has_new_line = false;
         truncated = false;
         minimax_reasoning_prefix_injected = false;
+        minimax_reasoning_prefix_streamed = false;
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -2804,16 +2806,8 @@ struct server_context {
 
         const bool needs_minimax_prefix =
             slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        if (needs_minimax_prefix) {
-            slot.minimax_reasoning_prefix_injected = true;
-            if (slot.task->params.stream) {
-                completion_token_output prefix_chunk{};
-                prefix_chunk.tok = LLAMA_TOKEN_NULL;
-                prefix_chunk.prob = 0.0f;
-                prefix_chunk.text_to_send = "<think>\n";
-                send_partial_response(slot, prefix_chunk, false);
-            }
-        }
+        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
+        slot.minimax_reasoning_prefix_streamed = false;
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -2874,8 +2868,26 @@ struct server_context {
             result.text_to_send = token_str;
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
-            if (slot.task->params.stream) {
-                send_partial_response(slot, result, false);
+
+            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
+                if (!slot.task->params.stream) {
+                    return;
+                }
+
+                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+                    completion_token_output prefix_chunk{};
+                    prefix_chunk.tok = LLAMA_TOKEN_NULL;
+                    prefix_chunk.prob = 0.0f;
+                    prefix_chunk.text_to_send = "<think>\n";
+                    send_partial_response(slot, prefix_chunk, false);
+                    slot.minimax_reasoning_prefix_streamed = true;
+                }
+
+                send_partial_response(slot, chunk, false);
+            };
+
+            if (send_text) {
+                stream_with_minimax_prefix(result);
             }
         }

From 57db9d7bfc773b7fdfd9f89c6e41c959fa704cf4 Mon Sep 17 00:00:00 2001
From: Pascal
Date: Sun, 2 Nov 2025 16:45:21 +0100
Subject: [PATCH 3/3] server: address review feedback from ngxson

Move the minimax-m2 prefix injection logic from server.cpp to chat.cpp via
common_chat_stream_state.
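
Usage sketch (illustrative only; `syntax` and `generated_text` stand for the
caller's chat syntax and accumulated generation, the helper names match the
ones introduced in this patch):

    common_chat_stream_state stream_state(syntax);   // or stream_state.init(syntax)
    if (auto prefix = stream_state.consume_reasoning_prefix()) {
        // stream *prefix ("<think>\n") as the first content delta
    }
    // parsing always sees the prefixed text:
    auto msg = common_chat_parse_stream(generated_text, /* is_partial= */ true,
                                        stream_state, syntax);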
---
 common/chat.cpp         | 38 +++++++++++++++++++++++++++
 common/chat.h           | 26 ++++++++++++++++++
 tools/server/server.cpp | 58 +++++++++++------------------------------
 3 files changed, 79 insertions(+), 43 deletions(-)

diff --git a/common/chat.cpp b/common/chat.cpp
index dff5f3c1f6579..a6a3e8f85dca5 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -676,6 +676,35 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
     throw std::runtime_error("Unknown reasoning format: " + format);
 }
 
+void common_chat_stream_state::init(const common_chat_syntax & syntax) {
+    reasoning_prefix_streamed_ = false;
+
+    if (syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
+        reasoning_prefix_ = "<think>\n";
+    } else {
+        reasoning_prefix_.clear();
+    }
+}
+
+std::string common_chat_stream_state::apply_reasoning_prefix(const std::string & text) const {
+    if (reasoning_prefix_.empty()) {
+        return text;
+    }
+
+    std::string result(reasoning_prefix_);
+    result += text;
+    return result;
+}
+
+std::optional<std::string> common_chat_stream_state::consume_reasoning_prefix() {
+    if (!reasoning_prefix_pending()) {
+        return std::nullopt;
+    }
+
+    reasoning_prefix_streamed_ = true;
+    return reasoning_prefix_;
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
     std::string arguments;
     if (builder.is_partial()) {
@@ -3154,3 +3183,12 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
     }
     return msg;
 }
+
+common_chat_msg common_chat_parse_stream(
+        const std::string & input,
+        bool is_partial,
+        common_chat_stream_state & stream_state,
+        const common_chat_syntax & syntax) {
+    const auto text_to_parse = stream_state.apply_reasoning_prefix(input);
+    return common_chat_parse(text_to_parse, is_partial, syntax);
+}
diff --git a/common/chat.h b/common/chat.h
index 50efb0d4e516f..6edb5418f30bd 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -8,6 +8,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <optional>
 
 struct common_chat_templates;
@@ -159,6 +160,26 @@ struct common_chat_syntax {
     bool parse_tool_calls = true;
 };
 
+struct common_chat_stream_state {
+    common_chat_stream_state() = default;
+    explicit common_chat_stream_state(const common_chat_syntax & syntax) { init(syntax); }
+
+    void init(const common_chat_syntax & syntax);
+
+    std::string apply_reasoning_prefix(const std::string & text) const;
+
+    std::optional<std::string> consume_reasoning_prefix();
+
+    bool has_reasoning_prefix() const { return !reasoning_prefix_.empty(); }
+    bool reasoning_prefix_pending() const { return has_reasoning_prefix() && !reasoning_prefix_streamed_; }
+    const std::string & reasoning_prefix() const { return reasoning_prefix_; }
+    void mark_reasoning_prefix_streamed() { reasoning_prefix_streamed_ = true; }
+
+private:
+    std::string reasoning_prefix_;
+    bool reasoning_prefix_streamed_ = false;
+};
+
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
 bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
@@ -200,6 +221,11 @@ const char* common_chat_format_name(common_chat_format format);
 const char* common_reasoning_format_name(common_reasoning_format format);
 common_reasoning_format common_reasoning_format_from_name(const std::string & format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
+common_chat_msg common_chat_parse_stream(
+        const std::string & input,
+        bool is_partial,
+        common_chat_stream_state & stream_state,
+        const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index bc9dc6cf0e425..20de5e41ef91c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1663,8 +1663,7 @@ struct server_slot {
     bool has_next_token = true;
     bool has_new_line = false;
     bool truncated = false;
-    bool minimax_reasoning_prefix_injected = false;
-    bool minimax_reasoning_prefix_streamed = false;
+    common_chat_stream_state reasoning_stream_state;
 
     stop_type stop;
@@ -1735,8 +1734,7 @@ struct server_slot {
         generated_text = "";
         has_new_line = false;
         truncated = false;
-        minimax_reasoning_prefix_injected = false;
-        minimax_reasoning_prefix_streamed = false;
+        reasoning_stream_state = {};
         stop = STOP_TYPE_NONE;
         stopping_word = "";
         n_sent_text = 0;
@@ -1863,14 +1861,12 @@ struct server_slot {
         GGML_ASSERT(task);
 
         auto previous_msg = chat_msg;
-        std::string text_to_parse = generated_text;
-        if (minimax_reasoning_prefix_injected) {
-            text_to_parse.insert(0, "<think>\n");
-        }
+        const auto text_to_parse = reasoning_stream_state.apply_reasoning_prefix(generated_text);
         SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
-        auto new_msg = common_chat_parse(
-            text_to_parse,
+        auto new_msg = common_chat_parse_stream(
+            generated_text,
             /* is_partial= */ stop != STOP_TYPE_EOS,
+            reasoning_stream_state,
             task->params.oaicompat_chat_syntax);
         if (!new_msg.empty()) {
             new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
@@ -2804,10 +2800,7 @@ struct server_context {
 
         slot.state = SLOT_STATE_STARTED;
 
-        const bool needs_minimax_prefix =
-            slot.task->params.oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2;
-        slot.minimax_reasoning_prefix_injected = needs_minimax_prefix;
-        slot.minimax_reasoning_prefix_streamed = false;
+        slot.reasoning_stream_state.init(slot.task->params.oaicompat_chat_syntax);
 
         SLT_INF(slot, "%s", "processing task\n");
 
@@ -2869,25 +2862,16 @@ struct server_context {
             slot.add_token(result);
             result.text_to_send = std::move(delta_to_send);
 
-            auto stream_with_minimax_prefix = [&](const completion_token_output & chunk) {
-                if (!slot.task->params.stream) {
-                    return;
-                }
-
-                if (slot.minimax_reasoning_prefix_injected && !slot.minimax_reasoning_prefix_streamed) {
+            if (send_text && slot.task->params.stream) {
+                if (auto prefix = slot.reasoning_stream_state.consume_reasoning_prefix()) {
                     completion_token_output prefix_chunk{};
                     prefix_chunk.tok = LLAMA_TOKEN_NULL;
                     prefix_chunk.prob = 0.0f;
-                    prefix_chunk.text_to_send = "<think>\n";
+                    prefix_chunk.text_to_send = *prefix;
                     send_partial_response(slot, prefix_chunk, false);
-                    slot.minimax_reasoning_prefix_streamed = true;
                 }
 
-                send_partial_response(slot, chunk, false);
-            };
-
-            if (send_text) {
-                stream_with_minimax_prefix(result);
+                send_partial_response(slot, result, false);
             }
         }
@@ -3058,11 +3042,7 @@ struct server_context {
         return true;
     }
 
-    void send_partial_response(
-            server_slot & slot,
-            const completion_token_output & tkn,
-            bool is_progress,
-            const std::vector<common_chat_msg_diff> * forced_diffs = nullptr) {
+    void send_partial_response(server_slot & slot, const completion_token_output & tkn, bool is_progress) {
         auto res = std::make_unique<server_task_result_cmpl_partial>();
 
         res->id = slot.task->id;
@@ -3080,11 +3060,7 @@ struct server_context {
                 res->tokens = { tkn.tok };
             }
 
-            if (forced_diffs) {
-                res->oaicompat_msg_diffs = *forced_diffs;
-            } else {
-                slot.update_chat_msg(res->oaicompat_msg_diffs);
-            }
+            slot.update_chat_msg(res->oaicompat_msg_diffs);
         }
 
         res->n_decoded = slot.n_decoded;
@@ -3115,12 +3091,8 @@ struct server_context {
         res->id = slot.task->id;
         res->id_slot = slot.id;
 
-        res->index = slot.task->index;
-        std::string response_content = slot.generated_text;
-        if (slot.minimax_reasoning_prefix_injected) {
-            response_content.insert(0, "<think>\n");
-        }
-        res->content = std::move(response_content);
+        res->index = slot.task->index;
+        res->content = slot.reasoning_stream_state.apply_reasoning_prefix(slot.generated_text);
         res->tokens = std::move(slot.generated_tokens);
         res->timings = slot.get_timings();
         res->prompt = slot.task->tokens.detokenize(ctx, true);