Skip to content

Commit 56bb724

Browse files
committed
model : add reasoning/tool support for Llama 3.x Nemotron
1 parent 19f68fa commit 56bb724

File tree

8 files changed

+210
-1
lines changed

8 files changed

+210
-1
lines changed

common/chat-parser.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,20 @@ bool common_chat_msg_parser::try_consume_literal(const std::string & literal) {
9898
return true;
9999
}
100100

101+
bool common_chat_msg_parser::try_consume_partial_literal(const std::string & literal) {
102+
if (is_partial_) {
103+
auto idx = string_find_partial_stop(input_, literal);
104+
if (idx != std::string::npos && idx >= pos_) {
105+
auto end = input_.size();
106+
if (end < idx + literal.size()) {
107+
throw common_chat_msg_partial_exception(literal);
108+
}
109+
}
110+
}
111+
112+
return try_consume_literal(literal);
113+
}
114+
101115
std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_literal(const std::string & literal) {
102116
auto idx = input_.find(literal, pos_);
103117
if (idx != std::string::npos) {
@@ -145,7 +159,7 @@ bool common_chat_msg_parser::try_parse_reasoning(const std::string & start_think
145159
}
146160
};
147161
if (syntax_.reasoning_format != COMMON_REASONING_FORMAT_NONE) {
148-
if (syntax_.thinking_forced_open || try_consume_literal(start_think)) {
162+
if (syntax_.thinking_forced_open || try_consume_partial_literal(start_think)) {
149163
if (auto res = try_find_literal(end_think)) {
150164
handle_reasoning(res->prelude, /* closed */ true);
151165
consume_spaces();

common/chat-parser.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ class common_chat_msg_parser {
8282
std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
8383

8484
bool try_consume_literal(const std::string & literal);
85+
bool try_consume_partial_literal(const std::string & literal);
8586

8687
std::optional<find_regex_result> try_find_literal(const std::string & literal);
8788

common/chat.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ const char * common_chat_format_name(common_chat_format format) {
586586
case COMMON_CHAT_FORMAT_MISTRAL_NEMO: return "Mistral Nemo";
587587
case COMMON_CHAT_FORMAT_LLAMA_3_X: return "Llama 3.x";
588588
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS: return "Llama 3.x with builtin tools";
589+
case COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON: return "Llama 3.x Nemotron";
589590
case COMMON_CHAT_FORMAT_DEEPSEEK_R1: return "DeepSeek R1";
590591
case COMMON_CHAT_FORMAT_FIREFUNCTION_V2: return "FireFunction v2";
591592
case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2: return "Functionary v3.2";
@@ -1698,6 +1699,57 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
16981699
builder.add_content(builder.consume_rest());
16991700
}
17001701

1702+
// Llama 3.x Nemotron: tool calls are emitted as a JSON array wrapped in
// <TOOLCALL>...</TOOLCALL>. Build a grammar that constrains that array to the
// declared tools, triggered lazily on the <TOOLCALL> marker unless a tool
// call is required.
static common_chat_params common_chat_params_init_llama_3_x_nemotron(const common_chat_template & tmpl, const struct templates_params & inputs) {
    common_chat_params data;
    // Only constrain generation when the request actually declares tools:
    // with no tools, the schema list would be empty and the resulting
    // {"anyOf": []} with minItems:1 would be an unsatisfiable grammar.
    if (!inputs.tools.is_null() && !inputs.tools.empty()) {
        // When a tool call is merely optional, engage the grammar only once
        // the <TOOLCALL> trigger word is generated.
        data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
        data.grammar = build_grammar([&](const common_grammar_builder & builder) {
            auto schemas = json::array();
            foreach_function(inputs.tools, [&](const json & tool) {
                const auto & function = tool.at("function");
                // One object schema per tool: name pinned via "const",
                // arguments constrained by the tool's parameter schema.
                schemas.push_back({
                    {"type", "object"},
                    {"properties", {
                        {"name", {
                            {"type", "string"},
                            {"const", function.at("name")},
                        }},
                        {"arguments", function.at("parameters")},
                    }},
                    {"required", json::array({"name", "arguments"})},
                });
            });
            auto schema = json {
                {"type", "array"},
                {"items", schemas.size() == 1 ? schemas[0] : json {{"anyOf", schemas}}},
                {"minItems", 1},
            };
            builder.add_rule("root", "\"<TOOLCALL>\" " + builder.add_schema("tool_calls", schema) + " \"</TOOLCALL>\"");
        });
        data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "<TOOLCALL>"});
        // Keep the markers as single tokens so the trigger/parsing sees them intact.
        data.preserved_tokens = {
            "<TOOLCALL>",
            "</TOOLCALL>"
        };
    }
    data.prompt = apply(tmpl, inputs);
    data.format = COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON;
    return data;
}
1737+
// Parse a Llama 3.x Nemotron response: optional <think>...</think> reasoning
// followed by either plain content or a <TOOLCALL>[...]</TOOLCALL> JSON array
// of tool calls.
static void common_chat_parse_llama_3_x_nemotron(common_chat_msg_parser & builder) {
    builder.try_parse_reasoning("<think>", "</think>");
    if (builder.syntax().parse_tool_calls) {
        static const common_regex open_tag(regex_escape("<TOOLCALL>"));
        static const common_regex close_tag(regex_escape("</TOOLCALL>"));

        parse_prefixed_json_tool_call_array(builder, open_tag);
        // If the closing tag never shows up, treat the remainder as content.
        if (!builder.try_find_regex(close_tag)) {
            builder.consume_rest();
        }
    } else {
        // Tool-call parsing disabled: everything after the reasoning is content.
        builder.add_content(builder.consume_rest());
    }
}
1752+
17011753
static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
17021754
common_chat_params data;
17031755
data.prompt = apply(tmpl, inputs);
@@ -1800,6 +1852,11 @@ static common_chat_params common_chat_templates_apply_jinja(
18001852
return common_chat_params_init_llama_3_x(tmpl, params, allow_python_tag_builtin_tools);
18011853
}
18021854

1855+
// Llama 3.x Nemotron (w/ tools)
1856+
if (src.find("<TOOLCALL>") != std::string::npos) {
1857+
return common_chat_params_init_llama_3_x_nemotron(tmpl, params);
1858+
}
1859+
18031860
// Plain handler (no tools)
18041861
if (params.tools.is_null() || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) {
18051862
return common_chat_params_init_without_tools(tmpl, params);
@@ -1905,6 +1962,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
19051962
case COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS:
19061963
common_chat_parse_llama_3_1(builder, /* with_builtin_tools= */ true);
19071964
break;
1965+
case COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON:
1966+
common_chat_parse_llama_3_x_nemotron(builder);
1967+
break;
19081968
case COMMON_CHAT_FORMAT_DEEPSEEK_R1:
19091969
common_chat_parse_deepseek_r1(builder);
19101970
break;

common/chat.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ enum common_chat_format {
103103
COMMON_CHAT_FORMAT_MISTRAL_NEMO,
104104
COMMON_CHAT_FORMAT_LLAMA_3_X,
105105
COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
106+
COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON,
106107
COMMON_CHAT_FORMAT_DEEPSEEK_R1,
107108
COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
108109
COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,

models/templates/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ These templates can be updated with the following commands:
1818
./scripts/get_chat_template.py mistralai/Mistral-Nemo-Instruct-2407 > models/templates/mistralai-Mistral-Nemo-Instruct-2407.jinja
1919
./scripts/get_chat_template.py NousResearch/Hermes-2-Pro-Llama-3-8B tool_use > models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja
2020
./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
21+
./scripts/get_chat_template.py nvidia/Llama-3_3-Nemotron-Super-49B-v1_5 > models/templates/nvidia-Llama-3_3-Nemotron-Super-49B-v1_5.jinja
2122
./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
2223
./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
2324
./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{% set bos = "<|begin_of_text|>" %}{%- set enable_thinking = true -%}{% set system_start_header = "<|start_header_id|>" %}{% set system_end_header = "<|end_header_id|>
2+
3+
" %}{% set start_header = "<|start_header_id|>" %}{% set end_header = "<|end_header_id|>
4+
5+
" %}{% set eot = "<|eot_id|>" %}{% set system_token = "system" %}{% set user_token = "user" %}{% set assistant_token = "assistant" %}{% set tool_token = "tool" %}{{- bos ~ system_start_header ~ system_token ~ system_end_header -}}{%- if messages[0].role == 'system' and messages[0].content != '' -%}{%- set system_content = messages[0].content -%}{%- if '/no_think' in system_content -%}{%- set system_content = system_content.replace('/no_think', '')|trim -%}{%- set enable_thinking = false -%}{%- elif '/think' in system_content -%}{%- set system_content = system_content.replace('/think', '')|trim -%}{%- set enable_thinking = true -%}{%- endif -%}{{- system_content + '
6+
7+
' -}}{%- endif -%}{%- if tools -%}{{- 'You can use the following tools to assist the user if required:
8+
<AVAILABLE_TOOLS>[' -}}{%- for tool in tools -%}{{- (tool.function if tool.function is defined else tool) | tojson -}}{{- ', ' if not loop.last else '' -}}{%- endfor -%}{{- ']</AVAILABLE_TOOLS>
9+
10+
If you decide to call any tool(s), use the following format:
11+
<TOOLCALL>[{{"name": "tool_name1", "arguments": "tool_args1"}}, {{"name": "tool_name2", "arguments": "tool_args2"}}]</TOOLCALL>
12+
13+
Response from tool(s) will be returned in this format:
14+
<TOOL_RESPONSE>[{{"response": "tool_response1"}}, {{"response": "tool_response2"}}]</TOOL_RESPONSE>
15+
16+
Based on the results returned by the tool(s), you can call additional tools if needed, correct tool calls if any errors are found, or just respond with the answer to the user.' -}}{%- endif -%}{{- eot -}}{%- for message in messages -%}{%- if message.role == user_token -%}{{- start_header ~ user_token ~ end_header -}}{{ message.content -}}{{ eot -}}{%- elif message.role == assistant_token -%}{%- if '</think>' in message.content -%}{%- set content = message.content.split('</think>')[-1].lstrip() -%}{%- else -%}{%- set content = message.content -%}{%- endif -%}{{- start_header ~ assistant_token ~ end_header -}}{{ content -}}{%- if message.tool_calls -%}{{- '<TOOLCALL>[' -}}{%- for call in message.tool_calls -%}{%- set fn = call.function if call.function is defined else call -%}{{- '{"name": "' + fn.name + '", "arguments": ' -}}{%- if fn.arguments is string -%}{{- fn.arguments -}}{%- else -%}{{- fn.arguments | tojson -}}{%- endif -%}{{- '}' + (', ' if not loop.last else '') -}}{%- endfor -%}{{- ']</TOOLCALL>' -}}{%- endif -%}{{- eot -}}{%- elif message.role == tool_token -%}{%- if loop.first or (messages[loop.index0 - 1].role != tool_token) -%}{{- start_header ~ tool_token ~ end_header -}}{{ '<TOOL_RESPONSE>[' -}}{%- endif -%}{{- message.content -}}{{- ', ' if not loop.last and (messages[loop.index0 + 1].role == tool_token) else '' -}}{%- if loop.last or (messages[loop.index0 + 1].role != tool_token) -%}{{- ']</TOOL_RESPONSE>' -}}{{ eot -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{- start_header ~ assistant_token ~ end_header -}}{%- if not enable_thinking -%}{{- '<think>
17+
18+
</think>
19+
20+
' -}}{%- endif -%}{%- endif -%}

tests/test-chat-parser.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,47 @@ static void test_reasoning() {
9999
assert_equals("<think>Cogito</think>", builder.result().content);
100100
assert_equals("Ergo sum", builder.consume_rest());
101101
}
102+
{
103+
common_chat_msg_parser builder("<tnk>Cogito", /* is_partial= */ true, {
104+
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
105+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
106+
/* .reasoning_in_content = */ false,
107+
/* .thinking_forced_open = */ false,
108+
});
109+
110+
assert_equals(true, builder.try_parse_reasoning("<tnk>", "</tnk>"));
111+
assert_equals("Cogito", builder.result().reasoning_content);
112+
assert_equals("", builder.consume_rest());
113+
}
114+
{
115+
common_chat_msg_parser builder("<t", /* is_partial= */ true, {
116+
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
117+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
118+
/* .reasoning_in_content = */ false,
119+
/* .thinking_forced_open = */ false,
120+
});
121+
122+
try {
123+
builder.try_parse_reasoning("<tnk>", "</tnk>");
124+
throw std::runtime_error("Expected exception");
125+
} catch (const std::exception & e) {
126+
if (std::string(e.what()).find("<tnk>") == std::string::npos) {
127+
throw std::runtime_error("Expected exception about partial <tnk>");
128+
}
129+
}
130+
}
131+
{
132+
common_chat_msg_parser builder("<think>Cogito", /* is_partial= */ true, {
133+
/* .format = */ COMMON_CHAT_FORMAT_CONTENT_ONLY,
134+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
135+
/* .reasoning_in_content = */ false,
136+
/* .thinking_forced_open = */ false,
137+
});
138+
139+
assert_equals(false, builder.try_parse_reasoning("<tnk>", "</tnk>"));
140+
assert_equals("", builder.result().reasoning_content);
141+
assert_equals("<think>Cogito", builder.consume_rest());
142+
}
102143
}
103144

104145
static void test_regex() {

tests/test-chat.cpp

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1386,6 +1386,77 @@ static void test_template_output_parsers() {
13861386
"{\"arg1\": 1}\n"
13871387
"```<|tool▁call▁end|><|tool▁calls▁end|>");
13881388
}
1389+
{
1390+
auto tmpls = read_templates("models/templates/nvidia-Llama-3_3-Nemotron-Super-49B-v1_5.jinja");
1391+
std::vector<std::string> end_tokens{ "<|eot_id|>" };
1392+
1393+
assert_equals(COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
1394+
1395+
test_templates(tmpls.get(), end_tokens, message_assist, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
1396+
test_templates(tmpls.get(), end_tokens, message_assist_thoughts, tools, "Hello, world!\nWhat's up?", /* expect_grammar_triggered= */ false);
1397+
1398+
assert_msg_equals(message_assist_thoughts_unparsed_deepseek,
1399+
common_chat_parse(
1400+
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
1401+
/* is_partial= */ false,
1402+
{COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON}));
1403+
assert_msg_equals(message_assist_thoughts,
1404+
common_chat_parse(
1405+
"<think>I'm\nthinking</think>Hello, world!\nWhat's up?",
1406+
/* is_partial= */ false,
1407+
{
1408+
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON,
1409+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
1410+
}));
1411+
assert_msg_equals(message_assist_thoughts,
1412+
common_chat_parse(
1413+
"I'm\nthinking</think>Hello, world!\nWhat's up?",
1414+
/* is_partial= */ false,
1415+
{
1416+
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON,
1417+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
1418+
/* .reasoning_in_content = */ false,
1419+
/* .thinking_forced_open = */ true,
1420+
}));
1421+
1422+
assert_msg_equals(message_assist_call_thoughts_unparsed,
1423+
common_chat_parse(
1424+
"<think>I'm\nthinking</think>\n\n"
1425+
"<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
1426+
/* is_partial= */ false,
1427+
{COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON}));
1428+
assert_msg_equals(message_assist_call,
1429+
common_chat_parse(
1430+
"<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
1431+
/* is_partial= */ false,
1432+
{COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON}));
1433+
assert_msg_equals(message_assist_call_thoughts,
1434+
common_chat_parse(
1435+
"<think>I'm\nthinking</think>\n\n"
1436+
"<TOOLCALL>[{\"name\": \"special_function\", \"arguments\": {\"arg1\": 1}}]</TOOLCALL>",
1437+
/* is_partial= */ false,
1438+
{
1439+
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON,
1440+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
1441+
}));
1442+
1443+
assert_msg_equals(message_assist_empty,
1444+
common_chat_parse(
1445+
"<th",
1446+
/* is_partial= */ true,
1447+
{
1448+
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON,
1449+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
1450+
}));
1451+
assert_msg_equals(message_assist_thoughts_no_content,
1452+
common_chat_parse(
1453+
"<think>I'm\nthinking",
1454+
/* is_partial= */ true,
1455+
{
1456+
/* .format = */ COMMON_CHAT_FORMAT_LLAMA_3_X_NEMOTRON,
1457+
/* .reasoning_format = */ COMMON_REASONING_FORMAT_DEEPSEEK,
1458+
}));
1459+
}
13891460
}
13901461

13911462
static void test_msg_diffs_compute() {

0 commit comments

Comments
 (0)