Skip to content
Closed
8 changes: 4 additions & 4 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
}

// download one single file from remote URL to local path
static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
// Check if the file already exists locally
auto file_exists = std::filesystem::exists(path);

Expand Down Expand Up @@ -467,7 +467,7 @@ static bool common_download_file_single(const std::string & url, const std::stri

// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
// Prepare download in parallel
std::vector<std::future<bool>> futures_download;
for (auto const & item : urls) {
Expand Down Expand Up @@ -711,12 +711,12 @@ bool common_has_curl() {
return false;
}

static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
LOG_ERR("error: built without CURL, cannot download model from internet\n");
return false;
}

static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
// stub used when llama.cpp is built without CURL support: downloading is
// not possible, so always report failure to the caller
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
    LOG_ERR("error: built without CURL, cannot download model from the internet\n");
    return false;
}
Expand Down
7 changes: 7 additions & 0 deletions common/arg.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,10 @@ struct common_remote_params {
};
// get remote file content, returns <http_code, raw_response_body>
std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params);

// download one single file from remote URL to local path
bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we don't need to expose these functions. instead, use common_remote_get_content, then write the response content to file using fstream

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe, but then I'll lose all the fancy functionality (caching, multi-threaded download, etc).


// download multiple files from remote URLs to local paths
// the input is a vector of pairs <url, path>
bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline);
5 changes: 3 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,9 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
if (LLAMA_CURL AND NOT WIN32)
llama_build_and_test(test-tokenizers-remote.cpp WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
endif()

if (LLAMA_LLGUIDANCE)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
Expand Down
164 changes: 164 additions & 0 deletions tests/test-tokenizers-remote.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
#include "arg.h"
#include "common.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

using json = nlohmann::json;

#undef NDEBUG
#include <cassert>

// HF endpoint and the repository that hosts the vocab files used by this test
std::string endpoint = "https://huggingface.co/";
std::string repo = "ggml-org/vocabs";

// write `content` to `fname`, truncating any existing file
// failures are reported to stderr but are non-fatal: callers use this to
// refresh a best-effort on-disk cache
static void write_file(const std::string & fname, const std::string & content) {
    std::ofstream file(fname);
    if (file) {
        file << content;
        // no explicit close(): the destructor flushes and closes (RAII)
    } else {
        fprintf(stderr, "warning: failed to open %s for writing\n", fname.c_str());
    }
}

// query the HF API for the file listing of a repo tree, caching the raw JSON
// response on disk so that a cached listing can be used when the request fails
static json get_hf_repo_dir(const std::string & hf_repo_with_branch, bool recursive, const std::string & repo_path, const std::string & bearer_token) {
    // split "repo:branch"; default to "main" when no branch is given
    auto tokens = string_split<std::string>(hf_repo_with_branch, ':');
    const std::string branch  = tokens.size() > 1 ? tokens.back() : "main";
    const std::string hf_repo = tokens[0];

    std::string url = endpoint + "api/models/" + hf_repo + "/tree/" + branch;

    std::string sub_path = repo_path;
    if (!sub_path.empty()) {
        // FIXME: path should be properly url-encoded!
        string_replace_all(sub_path, "/", "%2F");
        url += "/" + sub_path;
    }

    if (recursive) {
        url += "?recursive=true";
    }

    // request headers
    std::vector<std::string> headers = { "Accept: application/json" };
    if (!bearer_token.empty()) {
        headers.push_back("Authorization: Bearer " + bearer_token);
    }

    // we use "=" to avoid clashing with other component, while still being allowed on windows
    std::string cached_response_fname = "test_vocab=" + hf_repo + "/" + repo_path + "=" + branch + ".json";
    string_replace_all(cached_response_fname, "/", "_");
    const std::string cached_response_path = fs_get_cache_file(cached_response_fname);

    // make the request
    common_remote_params params;
    params.headers = headers;

    json res_data;
    try {
        // TODO: For pagination links we need response headers, which is not provided by common_remote_get_content()
        const auto res = common_remote_get_content(url, params);
        const long res_code = res.first;
        const std::string res_str(res.second.data(), res.second.size());

        if (res_code == 200) {
            // refresh the on-disk cache with the latest listing
            write_file(cached_response_path, res_str);
        } else if (res_code == 401) {
            throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
        } else {
            throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
        }
    } catch (const std::exception & e) {
        fprintf(stderr, "error: failed to get repo tree: %s\n", e.what());
        fprintf(stderr, "try reading from cache\n");
    }

    // read the (possibly just refreshed) cached response; on a fresh failure
    // this is the offline fallback
    try {
        std::ifstream f(cached_response_path);
        res_data = json::parse(f);
    } catch (const std::exception & e) {
        fprintf(stderr, "error: failed to get repo tree (check your internet connection)\n");
    }

    return res_data;
}

int main(void) {
if (common_has_curl()) {
json tree = get_hf_repo_dir(repo, true, {}, {});

if (!tree.empty()) {
std::vector<std::pair<std::string, std::string>> files;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Btw, avoid these nested STL containers - I find this very difficult to understand. This can be:

struct common_file_info {
    std::string whatever_the_first_string_is;
    std::string whatever_the_second_string_is;
    // etc ...
};

std::vector<common_file_info> files;

It's much easier to read and extend in the future with additional information if needed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe refactor common_download_file_multiple separately first then?

Copy link
Collaborator

@ngxson ngxson Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the problem is that you're trying to reuse common_download_file_multiple which is intended to be an internal function inside arg.cpp. But I still don't think we need to expose it to the outside world.

Tbh, I think this test is being quite over-engineered. I adhere to the KISS principle and here is my thoughts:

  • Caching is not necessary, as we don't have CI setup to use cache anyway
  • On local, if you want to run it, you can simply setup a scripts/get-*.sh. We already had some scripts like this
  • I don't see why we need to filter files by extension and download it manually via curl. Just download all files via git clone https://huggingface.co/ggml-org/vocabs. And that's even better, git already handle caching stuff
  • While we cannot run git clone on windows CI, we don't even need it. The tokenizer logic is guaranteed to be deterministic cross-platforms, we only need to run it on one of the linux CI job. Edit: windows runners on github CI have git pre-installed

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the problem is that you're trying to reuse common_download_file_multiple which is intended to be an internal function inside arg.cpp. But I still don't think we need to expose it to the outside world.

Well, maybe adding some new functions, as @ggerganov suggested, would be a better idea?

  • Getting repo file info
  • Downloading repo files by name/extension
  • Adding batch functionality to common_download_file_multiple

Tbh, I think this test is being quite over-engineered. I adhere to the KISS principle and here is my thoughts:

Undeniably. :)

* Caching is not necessary, as we don't have CI setup to use cache anyway

True.

* On local, if you want to run it, you can simply setup a `scripts/get-*.sh`. We already had some scripts like this

Should not be any reason to run this locally though, this is meant for CI.

* I don't see why we need to filter files by extension and download it manually via curl. Just download all files via `git clone https://huggingface.co/ggml-org/vocabs`. And that's even better, `git` already handle caching stuff

I don't want to rely on any specific tree structure, so in this case I would have to traverse the checkout to find all the right files instead, which adds just as much logic as before.

* ~While we cannot run `git clone` on windows CI, we don't even need it. The tokenizer logic is guaranteed to be deterministic cross-platforms, we only need to run it on **one of** the linux CI job.~ Edit: windows runners on github CI have git pre-installed

Yeah, I guess.

Copy link
Collaborator

@ngxson ngxson Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to rely on any specific tree structure, so in this case I would have to traverse the checkout to find all the right files instead, which adds just as much logic as before.

Just add a script to copy all *.gguf.* to a temp directory before running it, something like this:

find . -type f -name "*.gguf.*" -exec cp {} ./my_tmp_dir
test-tokenizers-all ./my_tmp_dir

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding batching to common_download_file_multiple would be useful though?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. That function should be kept simple. Batching is just a wrapper around it

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thinking was that it would be less duplicated effort, and that model downloading would benefit from it as it will now most likely get throttled on many splits.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't want to spend too much of my time arguing which way is better, but if you want to do it - do it.

Still, my concerns about whether the whole thing can be just a bash script seem to be ignored at this point.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, didn't mean to come off as ignoring it, just mulling it over. :)


for (const auto & item : tree) {
if (item.at("type") == "file") {
std::string path = item.at("path");

if (string_ends_with(path, ".gguf") || string_ends_with(path, ".gguf.inp") || string_ends_with(path, ".gguf.out")) {
// this is to avoid different repo having same file name, or same file name in different subdirs
std::string filepath = repo + "_" + path;
// to make sure we don't have any slashes in the filename
string_replace_all(filepath, "/", "_");
// to make sure we don't have any quotes in the filename
string_replace_all(filepath, "'", "_");
filepath = fs_get_cache_file(filepath);

files.push_back({endpoint + repo + "/resolve/main/" + path, filepath});
}
}
}
Comment on lines +90 to +111
Copy link
Member

@ggerganov ggerganov Jun 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should factor this in a libcommon function. Two main reasons:

  • Be able to reuse this
  • Contain the json.hpp within libcommon and avoid compiling it one more time just for this test


if (!files.empty()) {
bool downloaded = false;
const size_t batch_size = 6;
size_t batches = (files.size() + batch_size - 1) / batch_size;

for (size_t i = 0; i < batches; i++) {
size_t batch_pos = (i * batch_size);
size_t batch_step = batch_pos + batch_size;
auto batch_begin = files.begin() + batch_pos;
auto batch_end = batch_step >= files.size() ? files.end() : files.begin() + batch_step;
std::vector<std::pair<std::string, std::string>> batch(batch_begin, batch_end);

if (!(downloaded = common_download_file_multiple(batch, {}, false))) {
break;
}
}

if (downloaded) {
std::string dir_sep(1, DIRECTORY_SEPARATOR);

for (auto const & item : files) {
std::string filepath = item.second;

if (string_ends_with(filepath, ".gguf")) {
std::string vocab_inp = filepath + ".inp";
std::string vocab_out = filepath + ".out";
auto matching_inp = std::find_if(files.begin(), files.end(), [&vocab_inp](const auto & p) {
return p.second == vocab_inp;
});
auto matching_out = std::find_if(files.begin(), files.end(), [&vocab_out](const auto & p) {
return p.second == vocab_out;
});

if (matching_inp != files.end() && matching_out != files.end()) {
std::string test_command = "." + dir_sep + "test-tokenizer-0 '" + filepath + "'";
assert(std::system(test_command.c_str()) == 0);
} else {
printf("test-tokenizers-remote: %s found without .inp/out vocab files, skipping...\n", filepath.c_str());
}
}
}
} else {
printf("test-tokenizers-remote: failed to download files, unable to perform tests...\n");
}
}
} else {
printf("test-tokenizers-remote: failed to retrieve repository info, unable to perform tests...\n");
}
} else {
printf("test-tokenizers-remote: no curl, unable to perform tests...\n");
}
}
Loading