diff --git a/common/arg.cpp b/common/arg.cpp index d8f9bbd24301f..85e0018760688 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3442,6 +3442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- none: leaves thoughts unparsed in `message.content`\n" "- deepseek: puts thoughts in `message.reasoning_content`\n" "- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n" + "- minimax-m2: streams a synthetic opening `<think>` and keeps `</think>` tags in `message.content`\n" "(default: auto)", [](common_params & params, const std::string & value) { params.reasoning_format = common_reasoning_format_from_name(value); diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index ff83102788d49..f5728ef65bdb8 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -171,7 +171,8 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) { bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) { std::string pending_reasoning_prefix; - if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) { + if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE || + syntax_.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) { return false; } diff --git a/common/chat.cpp b/common/chat.cpp index 63583fb22489d..a6a3e8f85dca5 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -655,6 +655,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) { case COMMON_REASONING_FORMAT_AUTO: return "auto"; case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek"; case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy"; + case COMMON_REASONING_FORMAT_MINIMAX_M2: return "minimax-m2"; default: throw std::runtime_error("Unknown reasoning format"); } @@ -669,10 +670,41 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo return COMMON_REASONING_FORMAT_DEEPSEEK; 
} else if (format == "deepseek-legacy") { return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; + } else if (format == "minimax-m2") { + return COMMON_REASONING_FORMAT_MINIMAX_M2; } throw std::runtime_error("Unknown reasoning format: " + format); } +void common_chat_stream_state::init(const common_chat_syntax & syntax) { + reasoning_prefix_streamed_ = false; + + if (syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) { + reasoning_prefix_ = "<think>\n"; + } else { + reasoning_prefix_.clear(); + } +} + +std::string common_chat_stream_state::apply_reasoning_prefix(const std::string & text) const { + if (reasoning_prefix_.empty()) { + return text; + } + + std::string result(reasoning_prefix_); + result += text; + return result; +} + +std::optional<std::string> common_chat_stream_state::consume_reasoning_prefix() { + if (!reasoning_prefix_pending()) { + return std::nullopt; + } + + reasoning_prefix_streamed_ = true; + return reasoning_prefix_; +} + static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { std::string arguments; if (builder.is_partial()) { @@ -1790,7 +1822,8 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) { // <|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|> common_chat_parse_deepseek_v3_1_content(builder); } else { - if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) { + if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || + builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) { LOG_DBG("%s: reasoning_format none, adding content\n", __func__); common_chat_parse_deepseek_v3_1_content(builder); return; @@ -2001,7 +2034,9 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) { if (regex_match(analysis_regex, header)) { builder.move_to(header_start_pos); - if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || 
builder.syntax().reasoning_in_content) { + if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || + builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2 || + builder.syntax().reasoning_in_content) { builder.add_content(consume_end(true)); } else { builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>"); } @@ -3148,3 +3183,12 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co } return msg; } + +common_chat_msg common_chat_parse_stream( + const std::string & input, + bool is_partial, + common_chat_stream_state & stream_state, + const common_chat_syntax & syntax) { + const auto text_to_parse = stream_state.apply_reasoning_prefix(input); + return common_chat_parse(text_to_parse, is_partial, syntax); +} diff --git a/common/chat.h b/common/chat.h index 50efb0d4e516f..6edb5418f30bd 100644 --- a/common/chat.h +++ b/common/chat.h @@ -8,6 +8,7 @@ #include <chrono> #include <string> #include <vector> +#include <optional> struct common_chat_templates; @@ -159,6 +160,26 @@ struct common_chat_syntax { bool parse_tool_calls = true; }; +struct common_chat_stream_state { + common_chat_stream_state() = default; + explicit common_chat_stream_state(const common_chat_syntax & syntax) { init(syntax); } + + void init(const common_chat_syntax & syntax); + + std::string apply_reasoning_prefix(const std::string & text) const; + + std::optional<std::string> consume_reasoning_prefix(); + + bool has_reasoning_prefix() const { return !reasoning_prefix_.empty(); } + bool reasoning_prefix_pending() const { return has_reasoning_prefix() && !reasoning_prefix_streamed_; } + const std::string & reasoning_prefix() const { return reasoning_prefix_; } + void mark_reasoning_prefix_streamed() { reasoning_prefix_streamed_ = true; } + +private: + std::string reasoning_prefix_; + bool reasoning_prefix_streamed_ = false; +}; + // Check if the template supplied via "--chat-template" is supported or not. 
Returns true if it's valid bool common_chat_verify_template(const std::string & tmpl, bool use_jinja); @@ -200,6 +221,11 @@ const char* common_chat_format_name(common_chat_format format); const char* common_reasoning_format_name(common_reasoning_format format); common_reasoning_format common_reasoning_format_from_name(const std::string & format); common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax); +common_chat_msg common_chat_parse_stream( + const std::string & input, + bool is_partial, + common_chat_stream_state & stream_state, + const common_chat_syntax & syntax); common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice); diff --git a/common/common.h b/common/common.h index a8cb630ea5805..7d326dc1069b0 100644 --- a/common/common.h +++ b/common/common.h @@ -249,6 +249,7 @@ enum common_reasoning_format { COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content` COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas. + COMMON_REASONING_FORMAT_MINIMAX_M2, // Stream a synthetic opening <think> tag and keep </think> tags in `message.content` for MiniMax-M2 compatibility // do not extend this enum unless you absolutely have to // in most cases, use COMMON_REASONING_FORMAT_AUTO // see: https://github.com/ggml-org/llama.cpp/pull/15408 diff --git a/tools/server/README.md b/tools/server/README.md index c16d0bd6dcd7f..4cbdfe42a3e81 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co | `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | | `--jinja` | use jinja template for chat (default: disabled)
(env: LLAMA_ARG_JINJA) | -| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: deepseek)
(env: LLAMA_ARG_THINK) | +| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
- minimax-m2: Stream a synthetic opening <think> tag and keep </think> tags in `message.content` for MiniMax-M2 compatibility
(default: deepseek)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 92d30664e41f4..20de5e41ef91c 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -443,7 +443,10 @@ struct server_task { } common_reasoning_format reasoning_format = params_base.reasoning_format; if (data.contains("reasoning_format")) { - reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>()); + const auto requested = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>()); + if (requested != COMMON_REASONING_FORMAT_AUTO) { + reasoning_format = requested; + } } params.oaicompat_chat_syntax.reasoning_format = reasoning_format; params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); @@ -1660,6 +1663,7 @@ struct server_slot { bool has_next_token = true; bool has_new_line = false; bool truncated = false; + common_chat_stream_state reasoning_stream_state; stop_type stop; @@ -1730,6 +1734,7 @@ struct server_slot { generated_text = ""; has_new_line = false; truncated = false; + reasoning_stream_state = {}; stop = STOP_TYPE_NONE; stopping_word = ""; n_sent_text = 0; @@ -1856,10 +1861,12 @@ struct server_slot { GGML_ASSERT(task); auto previous_msg = chat_msg; - SRV_DBG("Parsing chat message: %s\n", generated_text.c_str()); - auto new_msg = common_chat_parse( + const auto text_to_parse = reasoning_stream_state.apply_reasoning_prefix(generated_text); + SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str()); + auto new_msg = common_chat_parse_stream( generated_text, /* is_partial= */ stop != STOP_TYPE_EOS, + reasoning_stream_state, task->params.oaicompat_chat_syntax); if (!new_msg.empty()) { new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id); @@ -2793,6 +2800,8 @@ struct server_context { slot.state = SLOT_STATE_STARTED; + slot.reasoning_stream_state.init(slot.task->params.oaicompat_chat_syntax); + SLT_INF(slot, 
"%s", "processing task\n"); return true; @@ -2848,8 +2857,20 @@ struct server_context { result.text_to_send = ""; } + std::string delta_to_send = result.text_to_send; + result.text_to_send = token_str; slot.add_token(result); - if (slot.task->params.stream) { + result.text_to_send = std::move(delta_to_send); + + if (send_text && slot.task->params.stream) { + if (auto prefix = slot.reasoning_stream_state.consume_reasoning_prefix()) { + completion_token_output prefix_chunk{}; + prefix_chunk.tok = LLAMA_TOKEN_NULL; + prefix_chunk.prob = 0.0f; + prefix_chunk.text_to_send = *prefix; + send_partial_response(slot, prefix_chunk, false); + } + send_partial_response(slot, result, false); } } @@ -3035,7 +3056,9 @@ struct server_context { res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000); } else { res->content = tkn.text_to_send; - res->tokens = { tkn.tok }; + if (tkn.tok != LLAMA_TOKEN_NULL) { + res->tokens = { tkn.tok }; + } slot.update_chat_msg(res->oaicompat_msg_diffs); } @@ -3050,7 +3073,7 @@ struct server_context { res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id; // populate res.probs_output - if (slot.task->params.sampling.n_probs > 0) { + if (slot.task->params.sampling.n_probs > 0 && tkn.tok != LLAMA_TOKEN_NULL) { res->prob_output = tkn; // copy the token probs } @@ -3069,7 +3092,7 @@ struct server_context { res->id_slot = slot.id; res->index = slot.task->index; - res->content = slot.generated_text; + res->content = slot.reasoning_stream_state.apply_reasoning_prefix(slot.generated_text); res->tokens = std::move(slot.generated_tokens); res->timings = slot.get_timings(); res->prompt = slot.task->tokens.detokenize(ctx, true);