diff --git a/common/chat-parser.cpp b/common/chat-parser.cpp index b3362519a68f3..c0bcd93ef5955 100644 --- a/common/chat-parser.cpp +++ b/common/chat-parser.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include using json = nlohmann::ordered_json; @@ -420,3 +422,590 @@ std::optional common_chat_msg_parse void common_chat_msg_parser::clear_tools() { result_.tool_calls.clear(); } + +// Qwen3-Coder XML tool call parser implementation +namespace { + // Constants for DoS protection + static constexpr size_t MAX_INPUT_SIZE = 1024 * 1024; // 1MB limit + static constexpr size_t MAX_PARAMETER_COUNT = 100; // Maximum parameters per function + static constexpr size_t MAX_TAG_NAME_LENGTH = 256; // Maximum tag name length + static constexpr size_t MAX_ATTRIBUTE_LENGTH = 1024; // Maximum attribute length + + // Helper function to set error details + void set_error(common_chat_msg_parser::XmlParseError & error, + common_chat_msg_parser::XmlParseErrorType type, + size_t position, + const std::string & context, + const std::string & message) { + error.type = type; + error.position = position; + error.context = context; + error.message = message; + } + + // Simple XML tag parser - safer than regex, using string_view for performance + struct XmlTag { + std::string name; + std::string attribute; + std::string content; + size_t start_pos = 0; + size_t end_pos = 0; + }; + + // Find XML tag with optional attribute - ITERATIVE implementation to avoid stack overflow + std::optional find_xml_tag(std::string_view text, std::string_view tag_name, size_t start_pos = 0, + common_chat_msg_parser::XmlParseError * error = nullptr) { + // Input validation for DoS protection + if (text.size() > MAX_INPUT_SIZE) { + LOG_DBG("XML input too large: %zu bytes (max: %zu)\n", text.size(), MAX_INPUT_SIZE); + if (error) { + set_error(*error, common_chat_msg_parser::XmlParseErrorType::INPUT_TOO_LARGE, 0, + std::string(text.substr(0, std::min(text.size(), size_t(100)))), + "XML input exceeds maximum size limit of " + std::to_string(MAX_INPUT_SIZE) + " bytes"); + } + return std::nullopt; + } + + if (tag_name.size() > MAX_TAG_NAME_LENGTH) { + LOG_DBG("Tag name too long: %zu chars (max: %zu)\n", tag_name.size(), MAX_TAG_NAME_LENGTH); + if (error) { + set_error(*error, common_chat_msg_parser::XmlParseErrorType::TAG_NAME_TOO_LONG, 0, + std::string(tag_name), + "Tag name exceeds maximum length of " + std::to_string(MAX_TAG_NAME_LENGTH) + " characters"); + } + return std::nullopt; + } + + if (start_pos >= text.size()) { + return std::nullopt; + } + + // PERFORMANCE OPTIMIZATION: Use string_view to avoid allocations + // Pre-compute tag patterns + const std::string open_tag_start = std::string("<") + std::string(tag_name); + const std::string close_tag = std::string(""; + + // ITERATIVE search to avoid recursion and potential stack overflow + size_t search_pos = start_pos; + while (search_pos < text.size()) { + // Look for opening tag + size_t open_pos = text.find(open_tag_start, search_pos); + if (open_pos == std::string::npos) { + return std::nullopt; + } + + // Validate that this is actually the start of our tag (not a substring) + // Check that the character after tag name is either '>' or '=' or whitespace + size_t check_pos = open_pos + open_tag_start.length(); + if (check_pos < text.size()) { + char next_char = text[check_pos]; + if (next_char != '>' && next_char != '=' && !std::isspace(next_char)) { + // This is a false match (e.g., looking for "tool" but found "tool_call") + // Continue searching from the next position + search_pos = open_pos + 1; + continue; + } + } + + // Find the end of the opening tag + size_t open_end = text.find('>', open_pos); + if (open_end == std::string::npos) { + return std::nullopt; + } + + XmlTag tag; + tag.start_pos = open_pos; + + // Extract attribute if present (for tags like or ) + // PERFORMANCE: Use string_view for substring operations + size_t tag_content_start = open_pos + 1 + tag_name.length(); + if (tag_content_start < open_end) { + // Look for '=' in the tag content + size_t eq_pos = text.find('=', tag_content_start); + if (eq_pos != std::string::npos && eq_pos < open_end) { + // Skip whitespace after '=' + size_t attr_start = eq_pos + 1; + while (attr_start < open_end && std::isspace(text[attr_start])) { + attr_start++; + } + + if (attr_start < open_end) { + size_t attr_end = open_end; + + // Handle quoted attribute values + if (text[attr_start] == '"' || text[attr_start] == '\'') { + char quote_char = text[attr_start]; + attr_start++; // Skip opening quote + + // Find closing quote + size_t quote_end = text.find(quote_char, attr_start); + if (quote_end != std::string::npos && quote_end < open_end) { + attr_end = quote_end; + } else { + // No closing quote found, treat as unquoted + attr_start--; // Go back to include the quote + } + } else { + // Unquoted attribute - trim trailing whitespace + while (attr_end > attr_start && std::isspace(text[attr_end - 1])) { + attr_end--; + } + } + + if (attr_start < attr_end) { + std::string_view attr_view = text.substr(attr_start, attr_end - attr_start); + // Validate attribute length + if (attr_view.size() <= MAX_ATTRIBUTE_LENGTH) { + tag.attribute = std::string(attr_view); + } else { + LOG_DBG("Attribute too long: %zu chars (max: %zu)\n", attr_view.size(), MAX_ATTRIBUTE_LENGTH); + if (error) { + set_error(*error, common_chat_msg_parser::XmlParseErrorType::ATTRIBUTE_TOO_LONG, + open_pos, std::string(attr_view.substr(0, 100)), + "Attribute exceeds maximum length of " + std::to_string(MAX_ATTRIBUTE_LENGTH) + " characters"); + } + return std::nullopt; + } + } + } + } + } + + // Look for closing tag - PERFORMANCE: Search from after opening tag + size_t close_pos = text.find(close_tag, open_end + 1); + if (close_pos == std::string::npos) { + return tag; + } + + tag.end_pos = close_pos + close_tag.length(); + tag.name = std::string(tag_name); + + // PERFORMANCE: Use string_view for content extraction + size_t content_start = open_end + 1; + size_t content_length = close_pos - content_start; + if (content_length > 0) { + std::string_view content_view = text.substr(content_start, content_length); + tag.content = std::string(content_view); + } + + return tag; + } + + return std::nullopt; + } + + // Find all XML tags with a specific name and attribute pattern - with limits, using string_view + std::vector find_all_xml_tags(std::string_view text, std::string_view tag_name, + common_chat_msg_parser::XmlParseError * error = nullptr) { + std::vector tags; + size_t pos = 0; + size_t tag_count = 0; + + while (pos < text.length() && tag_count < MAX_PARAMETER_COUNT) { + auto tag = find_xml_tag(text, tag_name, pos, error); + if (!tag) { + break; + } + tags.push_back(*tag); + pos = tag->end_pos; + ++tag_count; + } + + if (tag_count >= MAX_PARAMETER_COUNT) { + LOG_DBG("Too many tags found: %zu (max: %zu)\n", tag_count, MAX_PARAMETER_COUNT); + if (error) { + set_error(*error, common_chat_msg_parser::XmlParseErrorType::TOO_MANY_PARAMETERS, pos, + std::string(text.substr(pos, std::min(text.size() - pos, size_t(100)))), + "Too many " + std::string(tag_name) + " tags found (max: " + std::to_string(MAX_PARAMETER_COUNT) + ")"); + } + } + + return tags; + } + + // Trim whitespace from string using string_view for performance + std::string trim_whitespace(std::string_view str) { + size_t start = str.find_first_not_of(" \t\n\r"); + if (start == std::string::npos) { + return ""; + } + size_t end = str.find_last_not_of(" \t\n\r"); + return std::string(str.substr(start, end - start + 1)); + } + + // Safe integer parsing with overflow protection using string_view + bool safe_parse_int(std::string_view str, int & result) { + try { + // Check for potential overflow by using long long first + std::string str_copy(str); // stoll requires std::string + long long temp = std::stoll(str_copy); + if (temp > std::numeric_limits::max() || temp < std::numeric_limits::min()) { + return false; // Overflow + } + result = static_cast(temp); + return true; + } catch (const std::exception &) { + return false; + } + } + + // Safe float parsing with overflow protection using string_view + bool safe_parse_float(std::string_view str, float & result) { + try { + std::string str_copy(str); // stod requires std::string + double temp = std::stod(str_copy); + if (temp > std::numeric_limits::max() || temp < std::numeric_limits::lowest()) { + return false; // Overflow + } + result = static_cast(temp); + return true; + } catch (const std::exception &) { + return false; + } + } + + // Convert parameter value based on tool schema type - FIXED JSON injection vulnerability, using string_view + std::string convert_qwen3_param_value(std::string_view param_value, + std::string_view param_name, + const nlohmann::json & param_config, + std::string_view /* func_name */) { + std::string trimmed_value = trim_whitespace(param_value); + + // Handle null value + if (trimmed_value == "null") { + return "null"; + } + + // If we have schema information, use it + if (param_config.contains(param_name)) { + const auto & schema = param_config.at(std::string(param_name)); + if (schema.contains("type")) { + const auto & t = schema.at("type"); + // Handle union types like ["number","null"] + if (t.is_array()) { + std::vector types; + for (const auto & tv : t) { + if (tv.is_string()) { + types.push_back((std::string) tv); + } + } + auto list_contains = [&](const char * s) { + for (const auto & x : types) { + if (x == s) return true; + } + return false; + }; + auto has = [&](std::string_view ty) { + for (const auto & s : types) { + if (s == ty) return true; + } + // Back-compat synonyms + if (ty == "string") return list_contains("str") || list_contains("text"); + if (ty == "integer") return list_contains("int"); + if (ty == "number") return list_contains("float"); + if (ty == "boolean") return list_contains("bool"); + return false; + }; + if (has("null") && trimmed_value == "null") { + return "null"; + } + if (has("object") || has("array")) { + try { + auto parsed = json::parse(trimmed_value); + return parsed.dump(); + } catch (...) { + return json(trimmed_value).dump(); + } + } + if (has("integer")) { + int int_val; + if (safe_parse_int(trimmed_value, int_val)) { + return std::to_string(int_val); + } + // if integer parse fails, try number or fall through + } + if (has("number")) { + float float_val; + if (safe_parse_float(trimmed_value, float_val)) { + return std::to_string(float_val); + } + } + if (has("boolean")) { + if (trimmed_value == "true" || trimmed_value == "false") { + return trimmed_value; + } + return "false"; + } + if (has("string")) { + return json(trimmed_value).dump(); + } + // Unknown union types: fall through to generic inference below + } else if (t.is_string()) { + std::string param_type = t; + // Convert based on type + if (param_type == "string" || param_type == "str" || param_type == "text") { + // SECURITY FIX: Use nlohmann::json for proper escaping instead of manual concatenation + return json(trimmed_value).dump(); + } else if (param_type == "integer" || param_type == "int") { + int int_val; + if (safe_parse_int(trimmed_value, int_val)) { + return std::to_string(int_val); + } else { + // SECURITY FIX: Use proper JSON escaping for fallback string + return json(trimmed_value).dump(); + } + } else if (param_type == "number" || param_type == "float") { + float float_val; + if (safe_parse_float(trimmed_value, float_val)) { + return std::to_string(float_val); + } else { + // SECURITY FIX: Use proper JSON escaping for fallback string + return json(trimmed_value).dump(); + } + } else if (param_type == "boolean" || param_type == "bool") { + if (trimmed_value == "true" || trimmed_value == "false") { + return trimmed_value; + } + return "false"; + } else if (param_type == "object" || param_type == "array") { + try { + auto parsed = json::parse(trimmed_value); + return parsed.dump(); + } catch (...) { + // SECURITY FIX: Use proper JSON escaping for fallback string + return json(trimmed_value).dump(); + } + } + } + // If schema.type exists but is not string/array, fall through + } + } + + // Without schema, try to infer type from value + // First check if it's valid JSON (object or array) + try { + auto parsed_json = json::parse(trimmed_value); + return parsed_json.dump(); // It's valid JSON, return as-is + } catch (...) { + // Not valid JSON, continue with other type checks + } + + // Check if it's a number + int int_val; + if (safe_parse_int(trimmed_value, int_val)) { + return std::to_string(int_val); // It's an integer + } + + float float_val; + if (safe_parse_float(trimmed_value, float_val)) { + return std::to_string(float_val); // It's a float + } + + // Check if it's a boolean + if (trimmed_value == "true" || trimmed_value == "false") { + return trimmed_value; + } + + // Default to string - SECURITY FIX: Use proper JSON escaping + return json(trimmed_value).dump(); + } + + // Get parameter configuration from tools using string_view + nlohmann::json get_param_config(std::string_view func_name, + const std::vector & tools) { + for (const auto & tool : tools) { + if (tool.name == func_name) { + try { + auto params = json::parse(tool.parameters); + if (params.contains("properties")) { + return params["properties"]; + } + return params; + } catch (...) { + return json::object(); + } + } + } + return json::object(); + } +} + +bool common_chat_msg_parser::parse_qwen3_xml_tool_call(const std::string & content, + const std::vector & tools) { + XmlParseError error; + bool result = parse_qwen3_xml_tool_call(content, tools, error); + last_xml_error_ = error; + return result; +} + +bool common_chat_msg_parser::parse_qwen3_xml_tool_call(const std::string & content, + const std::vector & tools, + XmlParseError & error) { + // Clear any previous error + error.clear(); + + // Input validation for DoS protection + if (content.size() > MAX_INPUT_SIZE) { + LOG_DBG("XML content too large: %zu bytes (max: %zu)\n", content.size(), MAX_INPUT_SIZE); + set_error(error, XmlParseErrorType::INPUT_TOO_LARGE, 0, + content.substr(0, std::min(content.size(), size_t(100))), + "XML content exceeds maximum size limit of " + std::to_string(MAX_INPUT_SIZE) + " bytes"); + return false; + } + + // Validate tools vector size + if (tools.size() > MAX_PARAMETER_COUNT) { + LOG_DBG("Too many tools provided: %zu (max: %zu)\n", tools.size(), MAX_PARAMETER_COUNT); + set_error(error, XmlParseErrorType::TOO_MANY_TOOLS, 0, "", + "Too many tools provided: " + std::to_string(tools.size()) + " (max: " + std::to_string(MAX_PARAMETER_COUNT) + ")"); + return false; + } + + // PERFORMANCE OPTIMIZATION: Create hash set for O(1) function lookup + std::unordered_set valid_functions; + if (!tools.empty()) { + valid_functions.reserve(tools.size()); + for (const auto & tool : tools) { + valid_functions.insert(tool.name); + } + } + + // PERFORMANCE: Use string_view to avoid unnecessary string copies + std::string_view content_view(content); + + // Find tool_call tag + auto tool_call_tag = find_xml_tag(content_view, "tool_call", 0, &error); + if (!tool_call_tag) { + if (!error.has_error()) { + set_error(error, XmlParseErrorType::INVALID_XML_STRUCTURE, 0, content.substr(0, std::min(content.size(), size_t(100))), + "No valid tag found in content"); + } + return false; + } + + // Extract content before the tool call - with bounds checking + if (tool_call_tag->start_pos > 0 && tool_call_tag->start_pos <= content.size()) { + std::string content_before = content.substr(0, tool_call_tag->start_pos); + // Don't trim whitespace here as it might be significant for the content + if (!content_before.empty()) { + add_content(content_before); + } + } + + if (!tool_call_tag->end_pos) { + return true; + } + + // Find function tag within tool_call - use string_view for performance + std::string_view tool_call_content_view(tool_call_tag->content); + auto function_tag = find_xml_tag(tool_call_content_view, "function", 0, &error); + if (!function_tag || function_tag->attribute.empty()) { + LOG_DBG("Invalid or missing function tag in tool_call\n"); + if (!error.has_error()) { + set_error(error, XmlParseErrorType::INVALID_XML_STRUCTURE, tool_call_tag->start_pos, + tool_call_tag->content.substr(0, std::min(tool_call_tag->content.size(), size_t(100))), + "Invalid or missing tag with attribute in "); + } + return false; + } + + std::string function_name = trim_whitespace(function_tag->attribute); + + // Validate function name + if (function_name.empty() || function_name.size() > MAX_TAG_NAME_LENGTH) { + LOG_DBG("Invalid function name: '%s' (length: %zu, max: %zu)\n", + function_name.c_str(), function_name.size(), MAX_TAG_NAME_LENGTH); + set_error(error, XmlParseErrorType::INVALID_FUNCTION_NAME, + tool_call_tag->start_pos + function_tag->start_pos, + function_name, + "Invalid function name: '" + function_name + "' (length: " + std::to_string(function_name.size()) + ", max: " + std::to_string(MAX_TAG_NAME_LENGTH) + ")"); + return false; + } + + // PERFORMANCE OPTIMIZATION: Use hash set for O(1) function lookup instead of O(n) loop + if (!tools.empty() && valid_functions.find(function_name) == valid_functions.end()) { + LOG_DBG("Function '%s' not found in available tools\n", function_name.c_str()); + set_error(error, XmlParseErrorType::FUNCTION_NOT_FOUND, + tool_call_tag->start_pos + function_tag->start_pos, + function_name, + "Function '" + function_name + "' not found in available tools"); + return false; + } + + // Get parameter configuration for this function - use string_view + auto param_config = get_param_config(std::string_view(function_name), tools); + + // Parse parameters within function tag - use string_view for performance + json arguments = json::object(); + std::string_view function_content_view(function_tag->content); + auto parameter_tags = find_all_xml_tags(function_content_view, "parameter", &error); + + // Check if error occurred during parameter parsing + if (error.has_error()) { + return false; + } + + // Limit parameter count for DoS protection + size_t param_count = 0; + for (const auto & param_tag : parameter_tags) { + if (param_count >= MAX_PARAMETER_COUNT) { + LOG_DBG("Too many parameters for function '%s': %zu (max: %zu)\n", + function_name.c_str(), param_count, MAX_PARAMETER_COUNT); + set_error(error, XmlParseErrorType::TOO_MANY_PARAMETERS, + tool_call_tag->start_pos + function_tag->start_pos, + function_name, + "Too many parameters for function '" + function_name + "': " + std::to_string(param_count) + " (max: " + std::to_string(MAX_PARAMETER_COUNT) + ")"); + break; + } + + if (param_tag.attribute.empty()) { + LOG_DBG("Skipping parameter with empty attribute\n"); + continue; // Skip malformed parameter tags + } + + std::string param_name = trim_whitespace(param_tag.attribute); + std::string param_value = param_tag.content; + + // Validate parameter name + if (param_name.empty() || param_name.size() > MAX_TAG_NAME_LENGTH) { + LOG_DBG("Invalid parameter name: '%s' (length: %zu, max: %zu)\n", + param_name.c_str(), param_name.size(), MAX_TAG_NAME_LENGTH); + continue; + } + + // Convert value based on schema type - use string_view for performance + try { + std::string converted_value = convert_qwen3_param_value( + std::string_view(param_value), + std::string_view(param_name), + param_config, + std::string_view(function_name) + ); + arguments[param_name] = json::parse(converted_value); + ++param_count; + } catch (const std::exception & e) { + LOG_DBG("Failed to convert parameter '%s': %s, using raw value\n", param_name.c_str(), e.what()); + set_error(error, XmlParseErrorType::PARAMETER_CONVERSION_FAILED, + tool_call_tag->start_pos + function_tag->start_pos + param_tag.start_pos, + param_name + "=" + param_value, + "Failed to convert parameter '" + param_name + "': " + e.what()); + // Fallback to trimmed raw value with proper JSON escaping + arguments[param_name] = trim_whitespace(param_value); + ++param_count; + } + } + + // Add the tool call with error handling + try { + std::string args_json = arguments.dump(); + return add_tool_call(function_name, "", args_json); + } catch (const std::exception & e) { + LOG_DBG("Failed to serialize arguments for function '%s': %s\n", function_name.c_str(), e.what()); + set_error(error, XmlParseErrorType::JSON_SERIALIZATION_FAILED, + tool_call_tag->start_pos, + function_name, + "Failed to serialize arguments for function '" + function_name + "': " + e.what()); + return false; + } +} + diff --git a/common/chat-parser.h b/common/chat-parser.h index c8cdc63fb50f6..81decdf9bf3fc 100644 --- a/common/chat-parser.h +++ b/common/chat-parser.h @@ -8,6 +8,7 @@ #include #include +#include #include class common_chat_msg_partial_exception : public std::runtime_error { @@ -120,4 +121,45 @@ class common_chat_msg_parser { ); void clear_tools(); + + // Error reporting for XML parser + enum class XmlParseErrorType { + NONE, + INPUT_TOO_LARGE, + TAG_NAME_TOO_LONG, + ATTRIBUTE_TOO_LONG, + TOO_MANY_PARAMETERS, + TOO_MANY_TOOLS, + INVALID_XML_STRUCTURE, + FUNCTION_NOT_FOUND, + INVALID_FUNCTION_NAME, + PARAMETER_CONVERSION_FAILED, + JSON_SERIALIZATION_FAILED + }; + + struct XmlParseError { + XmlParseErrorType type = XmlParseErrorType::NONE; + size_t position = 0; + std::string context; + std::string message; + + bool has_error() const { return type != XmlParseErrorType::NONE; } + void clear() { + type = XmlParseErrorType::NONE; + position = 0; + context.clear(); + message.clear(); + } + }; + + // Qwen3-Coder XML tool call parser with error reporting + bool parse_qwen3_xml_tool_call(const std::string & content, const std::vector & tools); + bool parse_qwen3_xml_tool_call(const std::string & content, const std::vector & tools, XmlParseError & error); + + // Get last parse error + const XmlParseError & get_last_xml_parse_error() const { return last_xml_error_; } + +private: + XmlParseError last_xml_error_; }; + diff --git a/common/chat.cpp b/common/chat.cpp index 87212322ec248..9bb29a33da30d 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -639,6 +640,7 @@ const char * common_chat_format_name(common_chat_format format) { case COMMON_CHAT_FORMAT_SEED_OSS: return "Seed-OSS"; case COMMON_CHAT_FORMAT_NEMOTRON_V2: return "Nemotron V2"; case COMMON_CHAT_FORMAT_APERTUS: return "Apertus"; + case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: return "Qwen3 Coder XML"; default: throw std::runtime_error("Unknown chat format"); } @@ -2397,6 +2399,7 @@ static void common_chat_parse_nemotron_v2(common_chat_msg_parser & builder) { static void common_chat_parse_apertus(common_chat_msg_parser & builder) { // Parse thinking tags builder.try_parse_reasoning("<|inner_prefix|>", "<|inner_suffix|>"); + if (!builder.syntax().parse_tool_calls) { builder.add_content(builder.consume_rest()); return; @@ -2425,6 +2428,188 @@ static void common_chat_parse_apertus(common_chat_msg_parser & builder) { builder.add_content(builder.consume_rest()); } +static common_chat_params common_chat_params_init_qwen3_coder_xml(const common_chat_template & tmpl, const struct templates_params & inputs) { + common_chat_params data; + data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; + + // Always set the format to QWEN3_CODER_XML regardless of whether tools are provided + // The format identifies the template type, not the runtime configuration + data.format = COMMON_CHAT_FORMAT_QWEN3_CODER_XML; + + if (!inputs.tools.empty()) { + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + std::vector tool_rules; + + auto not_parameter_end = builder.add_rule("not_parameter_end", "([^<] | (\"<\" [^/]) | (\"]))*"); + + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + const std::string & name = function.at("name"); + auto parameters = function.at("parameters"); + builder.resolve_refs(parameters); + + std::unordered_set required; + if (parameters.contains("required")) { + for (const auto & p : parameters.at("required")) { + required.insert(p); + } + } + + // Build parameter rules for XML format + std::vector param_rules; + if (parameters.contains("properties")) { + for (const auto & [param_name, param_schema] : parameters["properties"].items()) { + std::string param_rule = "\"\" space "; + + // Add parameter value based on type (supports unions and anyOf/oneOf; sanitize unsupported {"not":{}} branches) + auto schema_local = param_schema; + + // Recursively remove entries like {"not":{}} inside anyOf/oneOf that json-schema-to-grammar doesn't support + std::function sanitize = [&](json &s) { + if (s.is_object()) { + if (s.contains("anyOf") && s["anyOf"].is_array()) { + json filtered = json::array(); + for (auto v : s["anyOf"]) { + if (v.is_object() && v.contains("not") && v["not"].is_object() && v["not"].empty()) { + continue; + } + sanitize(v); + filtered.push_back(v); + } + s["anyOf"] = filtered; + if (s["anyOf"].size() == 1) { + json single = s["anyOf"][0]; + s.erase("anyOf"); + for (auto it = single.begin(); it != single.end(); ++it) { + s[it.key()] = it.value(); + } + } + } + if (s.contains("oneOf") && s["oneOf"].is_array()) { + json filtered = json::array(); + for (auto v : s["oneOf"]) { + if (v.is_object() && v.contains("not") && v["not"].is_object() && v["not"].empty()) { + continue; + } + sanitize(v); + filtered.push_back(v); + } + s["oneOf"] = filtered; + if (s["oneOf"].size() == 1) { + json single = s["oneOf"][0]; + s.erase("oneOf"); + for (auto it = single.begin(); it != single.end(); ++it) { + s[it.key()] = it.value(); + } + } + } + for (auto it = s.begin(); it != s.end(); ++it) { + sanitize(it.value()); + } + } else if (s.is_array()) { + for (auto & v : s) sanitize(v); + } + }; + sanitize(schema_local); + + // Determine if schema allows a plain string (so we can accept unquoted text content in XML) + std::function allows_string = [&](const json & sch) -> bool { + if (!sch.is_object()) return false; + if (sch.contains("type")) { + const auto & t = sch.at("type"); + if (t.is_string()) { + std::string ts = t; + return ts == "string" || ts == "text" || ts == "str"; + } + if (t.is_array()) { + for (const auto & tv : t) { + if (tv.is_string() && (tv == "string" || tv == "text" || tv == "str")) { + return true; + } + } + } + } + if (sch.contains("anyOf") && sch["anyOf"].is_array()) { + for (const auto & v : sch["anyOf"]) { + if (allows_string(v)) return true; + } + } + if (sch.contains("oneOf") && sch["oneOf"].is_array()) { + for (const auto & v : sch["oneOf"]) { + if (allows_string(v)) return true; + } + } + return false; + }; + + if (allows_string(schema_local)) { + // For string-accepting schemas, keep freeform XML text (no JSON quoting) + param_rule += not_parameter_end; + } else { + // For non-strings (object/array/number/boolean/null), expect JSON per schema + param_rule += builder.add_schema(name + "-parameter-" + param_name, schema_local); + } + + param_rule += "\"\" space"; + + // Parameter is optional + if (required.find(param_name) == required.end()) { + param_rule = "(" + param_rule + ")? "; + } + + param_rules.push_back(param_rule); + } + } + + std::string function_content = param_rules.empty() ? "space" : string_join(param_rules, " "); + tool_rules.push_back(builder.add_rule(name + "-call", + "\"\" space \"\" space " + + function_content + " \"\" space \"\" space")); + }); + + auto tool_call = builder.add_rule("tool_call", string_join(tool_rules, " | ")); + builder.add_rule("root", inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call); + + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ""}); + data.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "", + "", + "", + "", + }; + } else { + // When no tools are provided, disable lazy grammar to avoid "no triggers set" error + data.grammar_lazy = false; + } + + data.prompt = apply(tmpl, inputs); + return data; +} + +static void common_chat_parse_qwen3_coder_xml(common_chat_msg_parser & builder) { + if (!builder.syntax().parse_tool_calls) { + builder.add_content(builder.consume_rest()); + return; + } + + std::string content = builder.consume_rest(); + + // Try to parse Qwen3-Coder XML format + // For now, use empty tools vector - we'll need to pass tools differently + std::vector empty_tools; + if (builder.parse_qwen3_xml_tool_call(content, empty_tools)) { + // Successfully parsed XML tool call + return; + } + // If no tool call found, treat as regular content + builder.add_content(content); +} + static void common_chat_parse_seed_oss(common_chat_msg_parser & builder) { // Parse thinking tags first - this handles the main reasoning content builder.try_parse_reasoning("", ""); @@ -2644,6 +2829,15 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_command_r7b(tmpl, params); } + // Qwen3-Coder XML format detection (must come before Hermes 2 Pro) + // Detect via explicit XML markers unique to Qwen3-Coder to avoid false positives in other templates. + // Require presence of , , and blocks. + if (src.find("") != std::string::npos && + src.find("") != std::string::npos) { return common_chat_params_init_granite(tmpl, params); @@ -2712,6 +2906,7 @@ static common_chat_params common_chat_templates_apply_jinja( return common_chat_params_init_mistral_nemo(tmpl, params); } + // Generic fallback return common_chat_params_init_generic(tmpl, params); } @@ -2844,6 +3039,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) { case COMMON_CHAT_FORMAT_APERTUS: common_chat_parse_apertus(builder); break; + case COMMON_CHAT_FORMAT_QWEN3_CODER_XML: + common_chat_parse_qwen3_coder_xml(builder); + break; default: throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format)); } diff --git a/common/chat.h b/common/chat.h index 3c277e15eba7f..69924c00dcd18 100644 --- a/common/chat.h +++ b/common/chat.h @@ -110,6 +110,7 @@ enum common_chat_format { COMMON_CHAT_FORMAT_DEEPSEEK_V3_1, COMMON_CHAT_FORMAT_HERMES_2_PRO, COMMON_CHAT_FORMAT_COMMAND_R7B, + COMMON_CHAT_FORMAT_QWEN3_CODER_XML, COMMON_CHAT_FORMAT_GRANITE, COMMON_CHAT_FORMAT_GPT_OSS, COMMON_CHAT_FORMAT_SEED_OSS, diff --git a/models/templates/Qwen3-Coder.jinja b/models/templates/Qwen3-Coder.jinja new file mode 100644 index 0000000000000..49b0e8d0ee7e6 --- /dev/null +++ b/models/templates/Qwen3-Coder.jinja @@ -0,0 +1,117 @@ +{% macro render_extra_keys(json_dict, handled_keys) %} + {%- if json_dict is mapping %} + {%- for json_key in json_dict if json_key not in handled_keys %} + {%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %} + {{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '' }} + {%- else %} + {{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '' }} + {%- endif %} + {%- endfor %} + {%- endif %} +{% endmacro %} + +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} + +{%- if not tools is defined %} + {%- set tools = [] %} +{%- endif %} + +{%- if system_message is defined %} + {{- "<|im_start|>system\n" + system_message }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }} + {%- endif %} +{%- endif %} +{%- if tools is iterable and tools | length > 0 %} + {{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }} + {{- "" }} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- "\n\n" ~ tool.name ~ "" }} + {%- if tool.description is defined %} + {{- '\n' ~ (tool.description | trim) ~ '' }} + {%- endif %} + {{- '\n' }} + {%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- '\n' }} + {{- '\n' ~ param_name ~ '' }} + {%- if param_fields.type is defined %} + {{- '\n' ~ (param_fields.type | string) ~ '' }} + {%- endif %} + {%- if param_fields.description is defined %} + {{- '\n' ~ (param_fields.description | trim) ~ '' }} + {%- endif %} + {%- set handled_keys = ['name', 'type', 'description'] %} + {{- render_extra_keys(param_fields, handled_keys) }} + {{- '\n' }} + {%- endfor %} + {%- endif %} + {% set handled_keys = ['type', 'properties'] %} + {{- render_extra_keys(tool.parameters, handled_keys) }} + {{- '\n' }} + {%- set handled_keys = ['type', 'name', 'description', 'parameters'] %} + {{- render_extra_keys(tool, handled_keys) }} + {{- '\n' }} + {%- endfor %} + {{- "\n" }} + {{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n\n\n\nvalue_1\n\n\nThis is the value for the second parameter\nthat can span\nmultiple lines\n\n\n\n\n\nReminder:\n- Function calls MUST follow the specified format: an inner block must be nested within XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n' }} +{%- endif %} +{%- if system_message is defined %} + {{- '<|im_end|>\n' }} +{%- else %} + {%- if tools is iterable and tools | length > 0 %} + {{- '<|im_end|>\n' }} + {%- endif %} +{%- endif %} +{%- for message in loop_messages %} + {%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %} + {{- '<|im_start|>' + message.role }} + {%- if message.content is defined and message.content is string and message.content | trim | length > 0 %} + {{- '\n' + message.content | trim + '\n' }} + {%- endif %} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '\n\n\n' }} + {%- if tool_call.arguments is defined %} + {%- for args_name, args_value in tool_call.arguments|items %} + {{- '\n' }} + {%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %} + {{- args_value }} + {{- '\n\n' }} + {%- endfor %} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>user\n' }} + {%- endif %} + {{- '\n' }} + {{- message.content }} + {{- '\n\n' }} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>\n' }} + {%- elif loop.last %} + {{- '<|im_end|>\n' }} + {%- endif %} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index 9cd67e3ef49d3..2182700b2c5c4 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1388,6 +1388,675 @@ static void test_template_output_parsers() { "{\"arg1\": 1}\n" "```<|tool▁call▁end|><|tool▁calls▁end|>"); } + + // Test Qwen3-Coder XML format - Comprehensive test suite + { + printf("Testing Qwen3-Coder XML format - Comprehensive Suite\n"); + + // Test 1: Basic XML tool call parsing + assert_msg_equals( + message_assist_call, + common_chat_parse( + "\n" + " \n" + " \n" + " 1\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 2: Multiple parameters with different types + common_chat_msg expected_multi_param; + expected_multi_param.role = "assistant"; + expected_multi_param.tool_calls = { + { "complex_function", "{\"name\":\"John Doe\",\"age\":30,\"active\":true,\"score\":95.5}", "" } + }; + + assert_msg_equals( + expected_multi_param, + common_chat_parse( + "\n" + " \n" + " \n" + " John Doe\n" + " \n" + " \n" + " 30\n" + " \n" + " \n" + " true\n" + " \n" + " \n" + " 95.5\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 3: Special characters and Unicode + common_chat_msg expected_special_chars; + expected_special_chars.role = "assistant"; + expected_special_chars.tool_calls = { + { "unicode_function", "{\"message\":\"Hello 世界! 🌍 Special chars: @#$%^&*()\"}", "" } + }; + + assert_msg_equals( + expected_special_chars, + common_chat_parse( + "\n" + " \n" + " \n" + " Hello 世界! 🌍 Special chars: @#$%^&*()\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 4: Multiline content with newlines and indentation + common_chat_msg expected_multiline; + expected_multiline.role = "assistant"; + expected_multiline.tool_calls = { + { "code_function", "{\"code\":\"def hello():\\n print(\\\"Hello, World!\\\")\\n return True\"}", "" } + }; + + assert_msg_equals( + expected_multiline, + common_chat_parse( + "\n" + " \n" + " \n" + "def hello():\n" + " print(\"Hello, World!\")\n" + " return True\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 5: JSON object as parameter value + common_chat_msg expected_json_param; + expected_json_param.role = "assistant"; + expected_json_param.tool_calls = { + { "json_function", "{\"config\":{\"host\":\"localhost\",\"port\":8080,\"ssl\":false}}", "" } + }; + + assert_msg_equals( + expected_json_param, + common_chat_parse( + "\n" + " \n" + " \n" + " {\"host\": \"localhost\", \"port\": 8080, \"ssl\": false}\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 6: Array as parameter value + common_chat_msg expected_array_param; + expected_array_param.role = "assistant"; + expected_array_param.tool_calls = { + { "array_function", "{\"items\":[\"apple\",\"banana\",\"cherry\"]}", "" } + }; + + assert_msg_equals( + expected_array_param, + common_chat_parse( + "\n" + " \n" + " \n" + " [\"apple\", \"banana\", \"cherry\"]\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 7: Empty parameter + common_chat_msg expected_empty_param; + expected_empty_param.role = "assistant"; + expected_empty_param.tool_calls = { + { "empty_function", "{\"empty_param\":\"\"}", "" } + }; + + assert_msg_equals( + expected_empty_param, + common_chat_parse( + "\n" + " \n" + " \n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 8: Boolean values (true/false) + common_chat_msg expected_boolean; + expected_boolean.role = "assistant"; + expected_boolean.tool_calls = { + { "boolean_function", "{\"enabled\":true,\"debug\":false}", "" } + }; + + assert_msg_equals( + expected_boolean, + common_chat_parse( + "\n" + " \n" + " \n" + " true\n" + " \n" + " \n" + " false\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 9: Null value + common_chat_msg expected_null; + expected_null.role = "assistant"; + expected_null.tool_calls = { + { "null_function", "{\"optional_param\":null}", "" } + }; + + assert_msg_equals( + expected_null, + common_chat_parse( + "\n" + " \n" + " \n" + " null\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 10: Negative numbers and scientific notation + common_chat_msg expected_numbers; + expected_numbers.role = "assistant"; + expected_numbers.tool_calls = { + { "math_function", "{\"negative\":-42,\"decimal\":-3.14,\"scientific\":1.23e-4}", "" } + }; + + assert_msg_equals( + expected_numbers, + common_chat_parse( + "\n" + " \n" + " \n" + " -42\n" + " \n" + " \n" + " -3.14\n" + " \n" + " \n" + " 1.23e-4\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 11: XML-like content in parameters (should be escaped) + common_chat_msg expected_xml_content; + expected_xml_content.role = "assistant"; + expected_xml_content.tool_calls = { + { "xml_function", "{\"xml_content\":\"value\"}", "" } + }; + + assert_msg_equals( + expected_xml_content, + common_chat_parse( + "\n" + " \n" + " \n" + " value\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 12: Quotes and escape characters + common_chat_msg expected_quotes; + expected_quotes.role = "assistant"; + expected_quotes.tool_calls = { + { "quote_function", "{\"message\":\"She said \\\"Hello!\\\" and left.\"}", "" } + }; + + assert_msg_equals( + expected_quotes, + common_chat_parse( + "\n" + " \n" + " \n" + " She said \"Hello!\" and left.\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 13: Long parameter value (simplified) + std::string long_text = "This is a long text parameter that should test the parser's ability to handle larger amounts of text data."; + + common_chat_msg expected_long_text; + expected_long_text.role = "assistant"; + expected_long_text.tool_calls = { + { "long_function", "{\"long_text\":\"" + long_text + "\"}", "" } + }; + + assert_msg_equals( + expected_long_text, + common_chat_parse( + "\n" + " \n" + " \n" + " " + long_text + "\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 14: Mixed content with text before and after tool call + common_chat_msg expected_mixed_content; + expected_mixed_content.role = "assistant"; + expected_mixed_content.content = "I'll help you search for products. "; + expected_mixed_content.tool_calls = { + { "search_function", "{\"query\":\"laptops\"}", "" } + }; + + assert_msg_equals( + expected_mixed_content, + common_chat_parse( + "I'll help you search for products. \n" + " \n" + " \n" + " laptops\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 15: Compact format (no extra whitespace) + common_chat_msg expected_compact; + expected_compact.role = "assistant"; + expected_compact.tool_calls = { + { "compact_function", "{\"param\":\"value\"}", "" } + }; + + assert_msg_equals( + expected_compact, + common_chat_parse( + "value", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 16: Function name with underscores and numbers + common_chat_msg expected_complex_name; + expected_complex_name.role = "assistant"; + expected_complex_name.tool_calls = { + { "get_user_data_v2", "{\"user_id\":12345}", "" } + }; + + assert_msg_equals( + expected_complex_name, + common_chat_parse( + "\n" + " \n" + " \n" + " 12345\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 17: Parameter names with underscores and numbers + common_chat_msg expected_complex_params; + expected_complex_params.role = "assistant"; + expected_complex_params.tool_calls = { + { "test_function", "{\"param_1\":\"value1\",\"param_2_name\":\"value2\",\"param3\":123}", "" } + }; + + assert_msg_equals( + expected_complex_params, + common_chat_parse( + "\n" + " \n" + " \n" + " value1\n" + " \n" + " \n" + " value2\n" + " \n" + " \n" + " 123\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + printf("✅ All Qwen3-Coder XML format tests passed!\n"); + } + + // Test Qwen3-Coder XML format - Error handling and edge cases + { + printf("Testing Qwen3-Coder XML format - Error handling and edge cases\n"); + + // Test 1: No tool_call tags (should be treated as regular content) + common_chat_msg expected_no_tool_call; + expected_no_tool_call.role = "assistant"; + expected_no_tool_call.content = "This is just regular text without any tool calls."; + + assert_msg_equals( + expected_no_tool_call, + common_chat_parse( + "This is just regular text without any tool calls.", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 2: Empty function name (should fall back to content) + common_chat_msg expected_empty_function; + expected_empty_function.role = "assistant"; + expected_empty_function.content = ""; + + assert_msg_equals( + expected_empty_function, + common_chat_parse( + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 3: Malformed parameter tags (should still parse function but ignore malformed params) + common_chat_msg expected_malformed_params; + expected_malformed_params.role = "assistant"; + expected_malformed_params.tool_calls = { + { "test", "{}", "" } // Empty arguments since parameter is malformed + }; + + assert_msg_equals( + expected_malformed_params, + common_chat_parse( + "no name", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 4: Nested tool calls (should parse the first one) + common_chat_msg expected_nested; + expected_nested.role = "assistant"; + expected_nested.tool_calls = { + { "outer_function", "{\"param\":\"value\"}", "" } + }; + + assert_msg_equals( + expected_nested, + common_chat_parse( + "\n" + " \n" + " \n" + " value\n" + " \n" + " \n" + "\n" + "\n" + " \n" + " \n" + " value2\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 5: Very deeply nested XML content in parameter + common_chat_msg expected_deep_xml; + expected_deep_xml.role = "assistant"; + expected_deep_xml.tool_calls = { + { "xml_parser", "{\"xml\":\"deep content\"}", "" } + }; + + assert_msg_equals( + expected_deep_xml, + common_chat_parse( + "\n" + " \n" + " \n" + " deep content\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 6: Parameter with only whitespace + common_chat_msg expected_whitespace_param; + expected_whitespace_param.role = "assistant"; + expected_whitespace_param.tool_calls = { + { "whitespace_function", "{\"spaces\":\"\"}", "" } + }; + + assert_msg_equals( + expected_whitespace_param, + common_chat_parse( + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 7: Parameter with tabs and mixed whitespace + common_chat_msg expected_mixed_whitespace; + expected_mixed_whitespace.role = "assistant"; + expected_mixed_whitespace.tool_calls = { + { "tab_function", "{\"content\":\"line1\\n\\tindented line\\n spaces\"}", "" } + }; + + assert_msg_equals( + expected_mixed_whitespace, + common_chat_parse( + "\n" + " \n" + " \n" + "line1\n" + "\tindented line\n" + " spaces\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 8: Control characters and special Unicode + common_chat_msg expected_control_chars; + expected_control_chars.role = "assistant"; + expected_control_chars.tool_calls = { + { "control_function", "{\"text\":\"Line1\\nLine2\\tTabbed\\rCarriage return\"}", "" } + }; + + assert_msg_equals( + expected_control_chars, + common_chat_parse( + "\n" + " \n" + " \n" + "Line1\nLine2\tTabbed\rCarriage return\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 9: Emoji and extended Unicode characters + common_chat_msg expected_emoji; + expected_emoji.role = "assistant"; + expected_emoji.tool_calls = { + { "emoji_function", "{\"message\":\"Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\"}", "" } + }; + + assert_msg_equals( + expected_emoji, + common_chat_parse( + "\n" + " \n" + " \n" + " Hello! 👋 🌟 🚀 Testing emojis: 😀😃😄😁 and symbols: ∑∏∆∇\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 10: Mathematical expressions and formulas + common_chat_msg expected_math; + expected_math.role = "assistant"; + expected_math.tool_calls = { + { "math_function", "{\"formula\":\"E = mc² and ∫f(x)dx = F(x) + C\"}", "" } + }; + + assert_msg_equals( + expected_math, + common_chat_parse( + "\n" + " \n" + " \n" + " E = mc² and ∫f(x)dx = F(x) + C\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 11: SQL injection-like content (should be safely escaped) + common_chat_msg expected_sql; + expected_sql.role = "assistant"; + expected_sql.tool_calls = { + { "sql_function", "{\"query\":\"SELECT * FROM users WHERE id = 1; DROP TABLE users; --\"}", "" } + }; + + assert_msg_equals( + expected_sql, + common_chat_parse( + "\n" + " \n" + " \n" + " SELECT * FROM users WHERE id = 1; DROP TABLE users; --\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 12: HTML/XML injection content + common_chat_msg expected_html; + expected_html.role = "assistant"; + expected_html.tool_calls = { + { "html_function", "{\"content\":\"\"}", "" } + }; + + assert_msg_equals( + expected_html, + common_chat_parse( + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 13: Binary-like content (base64) + common_chat_msg expected_binary; + expected_binary.role = "assistant"; + expected_binary.tool_calls = { + { "binary_function", "{\"data\":\"SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\"}", "" } + }; + + assert_msg_equals( + expected_binary, + common_chat_parse( + "\n" + " \n" + " \n" + " SGVsbG8gV29ybGQhIFRoaXMgaXMgYmFzZTY0IGVuY29kZWQgdGV4dC4=\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + // Test 14: Very large numbers (should be parsed as scientific notation) + common_chat_msg expected_large_numbers; + expected_large_numbers.role = "assistant"; + expected_large_numbers.tool_calls = { + { "number_function", "{\"big_int\":1e+60}", "" } // Large number becomes scientific notation + }; + + assert_msg_equals( + expected_large_numbers, + common_chat_parse( + "\n" + " \n" + " \n" + " 999999999999999999999999999999999999999999999999999999999999\n" + " \n" + " \n" + "", + /* is_partial= */ false, + {COMMON_CHAT_FORMAT_QWEN3_CODER_XML})); + + printf("✅ All Qwen3-Coder XML error handling and edge case tests passed!\n"); + } + { + // Qwen3-Coder template: ensure grammar builds with union types and unsupported {"not": {}} branches + auto tmpls = read_templates("models/templates/Qwen3-Coder.jinja"); + common_chat_templates_inputs inputs; + inputs.messages = { message_user }; + + common_chat_tool qwen_union_tool { + /* .name = */ "qwen_union", + /* .description = */ "Test tool for union/anyOf handling", + /* .parameters = */ R"({ + "type": "object", + "properties": { + "priority": { "type": ["number", "null"] }, + "maybe_text": { "anyOf": [ { "not": {} }, { "type": "string" } ] }, + "config": { "anyOf": [ { "type": "object" }, { "type": "null" } ] } + }, + "required": [] + })", + }; + inputs.tools = { qwen_union_tool }; + + auto params = common_chat_templates_apply(tmpls.get(), inputs); + assert_equals(COMMON_CHAT_FORMAT_QWEN3_CODER_XML, params.format); + assert_equals(false, params.grammar.empty()); + + // Grammar should compile successfully + auto grammar = build_grammar(params.grammar); + if (!grammar) { + throw std::runtime_error("Failed to build Qwen3-Coder grammar with union types"); + } + } + { auto tmpls = read_templates("models/templates/ibm-granite-granite-3.3-2B-Instruct.jinja"); std::vector end_tokens{ "<|end_of_text|>" }; @@ -2129,6 +2798,7 @@ static void test_template_output_parsers() { } + static void test_msg_diffs_compute() { printf("[%s]\n", __func__); {