1 change: 1 addition & 0 deletions common/arg.cpp
@@ -3442,6 +3442,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"- none: leaves thoughts unparsed in `message.content`\n"
"- deepseek: puts thoughts in `message.reasoning_content`\n"
"- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`\n"
"- minimax-m2: streams a synthetic opening `<think>` and keeps `</think>` tags in `message.content`\n"
Collaborator:
Should we name this something more generic, like `synthetic`?

Collaborator (author):
@ngxson I've moved as much as possible to chat.cpp. For parameter naming, I kept it consistent with the existing formats, treating the first model (DeepSeek) as the "parent" behavior reference.

However, we could prepare a more modular refactor by renaming the parameters to better reflect their actual behavior:

  • none -> disables the backend parser (name already good)
  • deepseek -> remove, or document that it's an "auto" alias (the most used option: backend reasoning parser that writes reasoning into `reasoning_content` chunks, i.e. the OpenAI-compatible target)
  • deepseek-legacy -> rename to `clone` or something clearer? (inline `<think>` tags plus a duplicate in `reasoning_content`, i.e. legacy + OAI-compat mirroring; I don't have a use case for this)
  • minimax-m2 (this PR) -> keeps reasoning tags inline and adds the missing `<think>` opening tag

To make this truly generic, we'd need an additional parameter defining the prepended string instead of hardcoding `<think>`. The use case: Jinja templates that pre-open the reasoning tag, so the model never regenerates it and the subsequent output becomes difficult to parse.

Would you prefer I open a follow-up issue to discuss a more generic synthetic-prefix approach with configurable strings? A rough sketch of what that could look like is below.
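
A minimal sketch of the configurable-prefix idea, assuming a hypothetical `reasoning_prefix` string on `common_chat_syntax` fed by an equally hypothetical `--reasoning-prefix` flag — neither exists in this PR, the names are illustrative only:

```cpp
// Sketch only: generic variant of common_chat_stream_state::init() where the
// synthetic prefix comes from the syntax instead of being tied to minimax-m2.
// `syntax.reasoning_prefix` is a hypothetical field, not part of this PR.
void common_chat_stream_state::init(const common_chat_syntax & syntax) {
    reasoning_prefix_streamed_ = false;

    // e.g. launched with: --reasoning-prefix $'<think>\n'
    // minimax-m2 would then become a preset that fills this field with "<think>\n".
    reasoning_prefix_ = syntax.reasoning_prefix;
}
```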

"(default: auto)",
[](common_params & params, const std::string & value) {
params.reasoning_format = common_reasoning_format_from_name(value);
3 changes: 2 additions & 1 deletion common/chat-parser.cpp
@@ -171,7 +171,8 @@ void common_chat_msg_parser::consume_literal(const std::string & literal) {
bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think, const std::string & end_think) {
std::string pending_reasoning_prefix;

if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE) {
if (syntax_.reasoning_format == COMMON_REASONING_FORMAT_NONE ||
syntax_.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
return false;
}

48 changes: 46 additions & 2 deletions common/chat.cpp
@@ -655,6 +655,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
case COMMON_REASONING_FORMAT_AUTO: return "auto";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
case COMMON_REASONING_FORMAT_MINIMAX_M2: return "minimax-m2";
default:
throw std::runtime_error("Unknown reasoning format");
}
@@ -669,10 +670,41 @@ common_reasoning_format common_reasoning_format_from_name(const std::string & fo
return COMMON_REASONING_FORMAT_DEEPSEEK;
} else if (format == "deepseek-legacy") {
return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
} else if (format == "minimax-m2") {
return COMMON_REASONING_FORMAT_MINIMAX_M2;
}
throw std::runtime_error("Unknown reasoning format: " + format);
}

void common_chat_stream_state::init(const common_chat_syntax & syntax) {
reasoning_prefix_streamed_ = false;

if (syntax.reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
reasoning_prefix_ = "<think>\n";
} else {
reasoning_prefix_.clear();
}
}

std::string common_chat_stream_state::apply_reasoning_prefix(const std::string & text) const {
if (reasoning_prefix_.empty()) {
return text;
}

std::string result(reasoning_prefix_);
result += text;
return result;
}

std::optional<std::string> common_chat_stream_state::consume_reasoning_prefix() {
if (!reasoning_prefix_pending()) {
return std::nullopt;
}

reasoning_prefix_streamed_ = true;
return reasoning_prefix_;
}

static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
std::string arguments;
if (builder.is_partial()) {
@@ -1790,7 +1822,8 @@ static void common_chat_parse_deepseek_v3_1(common_chat_msg_parser & builder) {
// </think><|tool▁calls▁begin|><|tool▁call▁begin|>function<|tool▁sep|>NAME\n```json\nJSON\n```<|tool▁call▁end|><|tool▁calls▁end|>
common_chat_parse_deepseek_v3_1_content(builder);
} else {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE) {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2) {
LOG_DBG("%s: reasoning_format none, adding content\n", __func__);
common_chat_parse_deepseek_v3_1_content(builder);
return;
@@ -2001,7 +2034,9 @@ static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {

if (regex_match(analysis_regex, header)) {
builder.move_to(header_start_pos);
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE || builder.syntax().reasoning_in_content) {
if (builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_NONE ||
builder.syntax().reasoning_format == COMMON_REASONING_FORMAT_MINIMAX_M2 ||
builder.syntax().reasoning_in_content) {
builder.add_content(consume_end(true));
} else {
builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|end|>");
@@ -3148,3 +3183,12 @@ common_chat_msg common_chat_parse(const std::string & input, bool is_partial, co
}
return msg;
}

common_chat_msg common_chat_parse_stream(
const std::string & input,
bool is_partial,
common_chat_stream_state & stream_state,
const common_chat_syntax & syntax) {
const auto text_to_parse = stream_state.apply_reasoning_prefix(input);
return common_chat_parse(text_to_parse, is_partial, syntax);
}
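
As an aside, a minimal sketch of how the new helpers are meant to compose in a streaming caller, mirroring the server.cpp changes further down in this diff (the function and variable names here are illustrative, not part of the PR):

```cpp
#include "chat.h"

// Illustrative caller: parse the accumulated text with the synthetic prefix applied,
// and emit that prefix to the client exactly once before the first regular delta.
static void handle_stream_chunk(const std::string        & generated_text,
                                bool                       is_partial,
                                common_chat_stream_state & stream_state,
                                const common_chat_syntax & syntax) {
    // "<think>\n" is prepended internally when reasoning_format == minimax-m2
    common_chat_msg msg = common_chat_parse_stream(generated_text, is_partial, stream_state, syntax);

    if (auto prefix = stream_state.consume_reasoning_prefix()) {
        // send *prefix as its own content chunk (no token id attached), cf. server.cpp
    }

    (void) msg; // diffs against the previous message would be computed here, as in update_chat_msg()
}
```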
26 changes: 26 additions & 0 deletions common/chat.h
@@ -8,6 +8,7 @@
#include <string>
#include <vector>
#include <map>
#include <optional>

struct common_chat_templates;

@@ -159,6 +160,26 @@ struct common_chat_syntax {
bool parse_tool_calls = true;
};

struct common_chat_stream_state {
common_chat_stream_state() = default;
explicit common_chat_stream_state(const common_chat_syntax & syntax) { init(syntax); }

void init(const common_chat_syntax & syntax);

std::string apply_reasoning_prefix(const std::string & text) const;

std::optional<std::string> consume_reasoning_prefix();

bool has_reasoning_prefix() const { return !reasoning_prefix_.empty(); }
bool reasoning_prefix_pending() const { return has_reasoning_prefix() && !reasoning_prefix_streamed_; }
const std::string & reasoning_prefix() const { return reasoning_prefix_; }
void mark_reasoning_prefix_streamed() { reasoning_prefix_streamed_ = true; }

private:
std::string reasoning_prefix_;
bool reasoning_prefix_streamed_ = false;
};

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);

@@ -200,6 +221,11 @@ const char* common_chat_format_name(common_chat_format format);
const char* common_reasoning_format_name(common_reasoning_format format);
common_reasoning_format common_reasoning_format_from_name(const std::string & format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_msg common_chat_parse_stream(
const std::string & input,
bool is_partial,
common_chat_stream_state & stream_state,
const common_chat_syntax & syntax);

common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);

1 change: 1 addition & 0 deletions common/common.h
@@ -249,6 +249,7 @@ enum common_reasoning_format {
COMMON_REASONING_FORMAT_AUTO, // Same as deepseek, using `message.reasoning_content`
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
COMMON_REASONING_FORMAT_MINIMAX_M2, // Stream a synthetic opening <think> tag and keep </think> tags in `message.content` for MiniMax-M2 compatibility
// do not extend this enum unless you absolutely have to
// in most cases, use COMMON_REASONING_FORMAT_AUTO
// see: https://github.com/ggml-org/llama.cpp/pull/15408
2 changes: 1 addition & 1 deletion tools/server/README.md
@@ -190,7 +190,7 @@ The project is under active development, and we are [looking for feedback and co
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>- minimax-m2: streams a synthetic opening `<think>` and keeps `</think>` tags in `message.content`<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
37 changes: 30 additions & 7 deletions tools/server/server.cpp
@@ -443,7 +443,10 @@ struct server_task {
}
common_reasoning_format reasoning_format = params_base.reasoning_format;
if (data.contains("reasoning_format")) {
reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
const auto requested = common_reasoning_format_from_name(data.at("reasoning_format").get<std::string>());
if (requested != COMMON_REASONING_FORMAT_AUTO) {
reasoning_format = requested;
}
}
params.oaicompat_chat_syntax.reasoning_format = reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
@@ -1660,6 +1663,7 @@ struct server_slot {
bool has_next_token = true;
bool has_new_line = false;
bool truncated = false;
common_chat_stream_state reasoning_stream_state;

stop_type stop;

@@ -1730,6 +1734,7 @@ struct server_slot {
generated_text = "";
has_new_line = false;
truncated = false;
reasoning_stream_state = {};
stop = STOP_TYPE_NONE;
stopping_word = "";
n_sent_text = 0;
@@ -1856,10 +1861,12 @@ struct server_slot {
GGML_ASSERT(task);

auto previous_msg = chat_msg;
SRV_DBG("Parsing chat message: %s\n", generated_text.c_str());
auto new_msg = common_chat_parse(
const auto text_to_parse = reasoning_stream_state.apply_reasoning_prefix(generated_text);
SRV_DBG("Parsing chat message: %s\n", text_to_parse.c_str());
auto new_msg = common_chat_parse_stream(
generated_text,
/* is_partial= */ stop != STOP_TYPE_EOS,
reasoning_stream_state,
task->params.oaicompat_chat_syntax);
if (!new_msg.empty()) {
new_msg.set_tool_call_ids(generated_tool_call_ids, gen_tool_call_id);
@@ -2793,6 +2800,8 @@ struct server_context {

slot.state = SLOT_STATE_STARTED;

slot.reasoning_stream_state.init(slot.task->params.oaicompat_chat_syntax);

SLT_INF(slot, "%s", "processing task\n");

return true;
@@ -2848,8 +2857,20 @@ struct server_context {
result.text_to_send = "";
}

std::string delta_to_send = result.text_to_send;
result.text_to_send = token_str;
slot.add_token(result);
if (slot.task->params.stream) {
result.text_to_send = std::move(delta_to_send);

if (send_text && slot.task->params.stream) {
if (auto prefix = slot.reasoning_stream_state.consume_reasoning_prefix()) {
completion_token_output prefix_chunk{};
prefix_chunk.tok = LLAMA_TOKEN_NULL;
prefix_chunk.prob = 0.0f;
prefix_chunk.text_to_send = *prefix;
send_partial_response(slot, prefix_chunk, false);
}

send_partial_response(slot, result, false);
}
}
@@ -3035,7 +3056,9 @@ struct server_context {
res->progress.time_ms = (ggml_time_us() - slot.t_start_process_prompt / 1000);
} else {
res->content = tkn.text_to_send;
res->tokens = { tkn.tok };
if (tkn.tok != LLAMA_TOKEN_NULL) {
res->tokens = { tkn.tok };
}

slot.update_chat_msg(res->oaicompat_msg_diffs);
}
@@ -3050,7 +3073,7 @@
res->oaicompat_cmpl_id = slot.task->params.oaicompat_cmpl_id;

// populate res.probs_output
if (slot.task->params.sampling.n_probs > 0) {
if (slot.task->params.sampling.n_probs > 0 && tkn.tok != LLAMA_TOKEN_NULL) {
res->prob_output = tkn; // copy the token probs
}

@@ -3069,7 +3092,7 @@
res->id_slot = slot.id;

res->index = slot.task->index;
res->content = slot.generated_text;
res->content = slot.reasoning_stream_state.apply_reasoning_prefix(slot.generated_text);
res->tokens = std::move(slot.generated_tokens);
res->timings = slot.get_timings();
res->prompt = slot.task->tokens.detokenize(ctx, true);