Skip to content
Closed
8 changes: 4 additions & 4 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
}

// download one single file from remote URL to local path
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);

Expand Down Expand Up @@ -467,7 +467,7 @@ static bool common_download_file_single(const std::string & url, const std::stri

// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (auto const & item : urls) {
Expand Down Expand Up @@ -711,12 +711,12 @@ bool common_has_curl() {
return false;
}

static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}

static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
// stub used when llama.cpp is built without CURL support: downloading is
// not possible, so always report failure to the caller
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
    return false;
}
Expand Down
7 changes: 7 additions & 0 deletions common/arg.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,10 @@ struct common_remote_params {
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

// download one single file from remote URL to local path
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we don't need to expose these functions. instead, use common_remote_get_content, then write the response content to file using fstream

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe, but then I'll lose all the fancy functionality (caching, multi-threaded download, etc).


// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline);
5 changes: 3 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,9 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
if (LLAMA_CURL AND NOT WIN32)
llama_build_and_test(test-tokenizers-remote.cpp WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endif()

if (LLAMA_LLGUIDANCE)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
Expand Down
164 changes: 164 additions & 0 deletions tests/test-tokenizers-remote.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#include "arg.h"
#include "common.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

#undef NDEBUG
#include <cassert>

// HF endpoint and the repository that hosts the vocab files used by this test
std::string endpoint = "https://huggingface.co/";
std::string repo = "ggml-org/vocabs";

// write `content` to `fname`, truncating any existing file
// failures are reported to stderr but are non-fatal: callers use this to
// refresh a best-effort on-disk cache
static void write_file(const std::string & fname, const std::string & content) {
    std::ofstream file(fname);
    if (file) {
        file << content;
        // no explicit close(): the destructor flushes and closes (RAII)
    } else {
        fprintf(stderr, "warning: failed to open %s for writing\n", fname.c_str());
    }
}

// query the HF API for the file listing of a repo tree, caching the raw JSON
// response on disk so that a cached listing can be used when the request fails
static json get_hf_repo_dir(const std::string & hf_repo_with_branch, bool recursive, const std::string & repo_path, const std::string & bearer_token) {
    // split "repo:branch"; default to "main" when no branch is given
    auto tokens = string_split<std::string>(hf_repo_with_branch, ':');
    const std::string branch  = tokens.size() > 1 ? tokens.back() : "main";
    const std::string hf_repo = tokens[0];

    std::string url = endpoint + "api/models/" + hf_repo + "/tree/" + branch;

    std::string sub_path = repo_path;
    if (!sub_path.empty()) {
        // FIXME: path should be properly url-encoded!
        string_replace_all(sub_path, "/", "%2F");
        url += "/" + sub_path;
    }

    if (recursive) {
        url += "?recursive=true";
    }

    // request headers
    std::vector<std::string> headers = { "Accept: application/json" };
    if (!bearer_token.empty()) {
        headers.push_back("Authorization: Bearer " + bearer_token);
    }

    // we use "=" to avoid clashing with other component, while still being allowed on windows
    std::string cached_response_fname = "test_vocab=" + hf_repo + "/" + repo_path + "=" + branch + ".json";
    string_replace_all(cached_response_fname, "/", "_");
    const std::string cached_response_path = fs_get_cache_file(cached_response_fname);

    // make the request
    common_remote_params params;
    params.headers = headers;

    json res_data;
    try {
        // TODO: For pagination links we need response headers, which is not provided by common_remote_get_content()
        const auto res = common_remote_get_content(url, params);
        const long res_code = res.first;
        const std::string res_str(res.second.data(), res.second.size());

        if (res_code == 200) {
            // refresh the on-disk cache with the latest listing
            write_file(cached_response_path, res_str);
        } else if (res_code == 401) {
            throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
        } else {
            throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
        }
    } catch (const std::exception & e) {
        fprintf(stderr, "error: failed to get repo tree: %s\n", e.what());
        fprintf(stderr, "try reading from cache\n");
    }

    // read the (possibly just refreshed) cached response; on a fresh failure
    // this is the offline fallback
    try {
        std::ifstream f(cached_response_path);
        res_data = json::parse(f);
    } catch (const std::exception & e) {
        fprintf(stderr, "error: failed to get repo tree (check your internet connection)\n");
    }

    return res_data;
}

int main(void) {
if (common_has_curl()) {
json tree = get_hf_repo_dir(repo, true, {}, {});

if (!tree.empty()) {
std::vector<std::pair<std::string, std::string>> files;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, avoid these nested STL containers - I find this very difficult to understand. This can be:

struct common_file_info {
    std::string whatever_the_first_string_is;
    std::string whatever_the_second_string_is;
    // etc ...
};

std::vector<common_file_info> files;

It's much easier to read and extend in the future with additional information if needed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe refactor common_download_file_multiple separately first then?

Copy link
Collaborator

@ngxson ngxson Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the problem is that you're trying to reuse common_download_file_multiple which is intended to be an internal function inside arg.cpp. But I still don't think we need to expose it to the outside world.

Tbh, I think this test is being quite over-engineered. I adhere to the KISS principle and here is my thoughts:

  • Caching is not necessary, as we don't have CI setup to use cache anyway
  • On local, if you want to run it, you can simply setup a scripts/get-*.sh. We already had some scripts like this
  • I don't see why we need to filter files by extension and download it manually via curl. Just download all files via git clone https://huggingface.co/ggml-org/vocabs. And that's even better, git already handle caching stuff
  • While we cannot run git clone on windows CI, we don't even need it. The tokenizer logic is guaranteed to be deterministic cross-platforms, we only need to run it on one of the linux CI job. Edit: windows runners on github CI have git pre-installed

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the problem is that you're trying to reuse common_download_file_multiple which is intended to be an internal function inside arg.cpp. But I still don't think we need to expose it to the outside world.

Well, maybe adding some new functions, as @ggerganov suggested, would be a better idea?

  • Getting repo file info
  • Downloading repo files by name/extension
  • Adding batch functionality to common_download_file_multiple

Tbh, I think this test is being quite over-engineered. I adhere to the KISS principle and here is my thoughts:

Undeniably. :)

* Caching is not necessary, as we don't have CI setup to use cache anyway

True.

* On local, if you want to run it, you can simply setup a `scripts/get-*.sh`. We already had some scripts like this

Should not be any reason to run this locally though, this is meant for CI.

* I don't see why we need to filter files by extension and download it manually via curl. Just download all files via `git clone https://huggingface.co/ggml-org/vocabs`. And that's even better, `git` already handle caching stuff

I don't want to rely on any specific tree structure, so in this case I would have to traverse the checkout to find all the right files instead, which adds just as much logic as before.

* ~While we cannot run `git clone` on windows CI, we don't even need it. The tokenizer logic is guaranteed to be deterministic cross-platforms, we only need to run it on **one of** the linux CI job.~ Edit: windows runners on github CI have git pre-installed

Yeah, I guess.

Copy link
Collaborator

@ngxson ngxson Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to rely on any specific tree structure, so in this case I would have to traverse the checkout to find all the right files instead, which adds just as much logic as before.

Just add a script to copy all *.gguf.* to a temp directory before running it, something like this:

find . -type f -name "*.gguf.*" -exec cp {} ./my_tmp_dir
test-tokenizers-all ./my_tmp_dir

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding batching to common_download_file_multiple would be useful though?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. That function should be kept simple. Batching is just a wrapper around it

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thinking was that it would be less duplicated effort, and that model downloading would benefit from it as it will now most likely get throttled on many splits.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to spend too much of my time arguing which way is better, but if you want to do it - do it.

Still, my concerns about whether the whole thing can be just a bash script seem to be ignored at this point.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, didn't mean to come off as ignoring it, just mulling it over. :)


for (const auto & item : tree) {
if (item.at("type") == "file") {
std::string path = item.at("path");

if (string_ends_with(path, ".gguf") || string_ends_with(path, ".gguf.inp") || string_ends_with(path, ".gguf.out")) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filepath = repo + "_" + path;
// to make sure we don't have any slashes in the filename
string_replace_all(filepath, "/", "_");
// to make sure we don't have any quotes in the filename
string_replace_all(filepath, "'", "_");
filepath = fs_get_cache_file(filepath);

files.push_back({endpoint + repo + "/resolve/main/" + path, filepath});
}
}
}
Comment on lines +90 to +111
Copy link
Member

@ggerganov ggerganov Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should factor this in a libcommon function. Two main reasons:

  • Be able to reuse this
  • Contain the json.hpp within libcommon and avoid compiling it one more time just for this test


if (!files.empty()) {
bool downloaded = false;
const size_t batch_size = 6;
size_t batches = (files.size() + batch_size - 1) / batch_size;

for (size_t i = 0; i < batches; i++) {
size_t batch_pos = (i * batch_size);
size_t batch_step = batch_pos + batch_size;
auto batch_begin = files.begin() + batch_pos;
auto batch_end = batch_step >= files.size() ? files.end() : files.begin() + batch_step;
std::vector<std::pair<std::string, std::string>> batch(batch_begin, batch_end);

if (!(downloaded = common_download_file_multiple(batch, {}, false))) {
break;
}
}

if (downloaded) {
std::string dir_sep(1, DIRECTORY_SEPARATOR);

for (auto const & item : files) {
std::string filepath = item.second;

if (string_ends_with(filepath, ".gguf")) {
std::string vocab_inp = filepath + ".inp";
std::string vocab_out = filepath + ".out";
auto matching_inp = std::find_if(files.begin(), files.end(), [&vocab_inp](const auto & p) {
return p.second == vocab_inp;
});
auto matching_out = std::find_if(files.begin(), files.end(), [&vocab_out](const auto & p) {
return p.second == vocab_out;
});

if (matching_inp != files.end() && matching_out != files.end()) {
std::string test_command = "." + dir_sep + "test-tokenizer-0 '" + filepath + "'";
assert(std::system(test_command.c_str()) == 0);
} else {
printf("test-tokenizers-remote: %s found without .inp/out vocab files, skipping...\n", filepath.c_str());
}
}
}
} else {
printf("test-tokenizers-remote: failed to download files, unable to perform tests...\n");
}
}
} else {
printf("test-tokenizers-remote: failed to retrieve repository info, unable to perform tests...\n");
}
} else {
printf("test-tokenizers-remote: no curl, unable to perform tests...\n");
}
}
Loading