Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 36 additions & 2 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,34 @@ function(llama_test target)
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()

# Registers an arbitrary command (for example a script) as a test.
# - target: the command to run; it is forwarded as the first COMMAND argument of add_test()
# Optional args:
# - NAME: name of the test (defaults to the value of target)
# - LABEL: label for the test (defaults to main)
# - WORKING_DIRECTORY: directory to run the command in (defaults to .)
# - ARGS: arguments to pass to the command
function(llama_test_cmd target)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # Arguments that are not introduced by a keyword would otherwise be dropped
    # silently (typically a forgotten ARGS), so surface them at configure time.
    if (NOT "${LLAMA_TEST_UNPARSED_ARGUMENTS}" STREQUAL "")
        message(WARNING "llama_test_cmd(${target}): ignoring unexpected arguments: ${LLAMA_TEST_UNPARSED_ARGUMENTS}")
    endif()

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()
    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()
    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_NAME "${LLAMA_TEST_NAME}")
    else()
        set(TEST_NAME "${target}")
    endif()

    # Single-value expansions are quoted so each stays one argument;
    # LLAMA_TEST_ARGS is intentionally unquoted so the list expands to
    # one command-line argument per element.
    add_test(
        NAME "${TEST_NAME}"
        WORKING_DIRECTORY "${LLAMA_TEST_WORKING_DIRECTORY}"
        COMMAND "${target}"
        ${LLAMA_TEST_ARGS})

    set_property(TEST "${TEST_NAME}" PROPERTY LABELS "${LLAMA_TEST_LABEL}")
endfunction()

# Builds and runs a test source file.
# Optional args:
# - NAME: name of the executable & test target (defaults to the source file name without extension)
Expand Down Expand Up @@ -97,8 +125,14 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE
llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847
# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf)
# Registers test-tokenizers-repo.sh as the test "test-tokenizers-ggml-vocabs".
# The script receives the vocab repository URL and the local checkout folder
# under models/ as its two arguments.
# Not registered on Windows - presumably because the script requires bash.
# The working directory is the runtime output directory; when no third argument
# is given the script looks for its test executable as ./test-tokenizer-0.
if (NOT WIN32)
llama_test_cmd(
${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh
NAME test-tokenizers-ggml-vocabs
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}
ARGS https://huggingface.co/ggml-org/vocabs ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocabs
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In a follow-up PR we can replace all ${CMAKE_CURRENT_SOURCE_DIR}/../models/ with ${PROJECT_SOURCE_DIR}/models/.

)
endif()

if (LLAMA_LLGUIDANCE)
llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
Expand Down
36 changes: 36 additions & 0 deletions tests/test-tokenizers-repo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Runs a tokenizer test executable against every vocab found in a git repository.
#
# Usage: test-tokenizers-repo.sh <git-repo> <target-folder> [<test-exe>]
#   <git-repo>       repository holding the *.gguf vocabs
#   <target-folder>  local checkout; updated with `git pull` if it already
#                    exists, cloned from <git-repo> otherwise
#   <test-exe>       test executable to run (default: ./test-tokenizer-0)
#
# Every *.gguf below <target-folder> that has both a matching .inp and .out
# file is passed to <test-exe>; vocabs without them are reported and skipped.
#
# Exit status: 0 when every executed test succeeded, 1 on usage errors, a
# missing test executable, a failed clone, or at least one failing test.

if [ $# -lt 2 ]; then
    # Arguments are passed to printf separately so a '%' in them is not
    # interpreted as a format directive.
    printf 'Usage: %s <git-repo> <target-folder> [<test-exe>]\n' "$0" >&2
    exit 1
fi

if [ $# -eq 3 ]; then
    toktest=$3
else
    toktest="./test-tokenizer-0"
fi

if [ ! -x "$toktest" ]; then
    printf 'Test executable "%s" not found!\n' "$toktest" >&2
    exit 1
fi

repo=$1
folder=$2

if [ -d "$folder" ] && [ -d "$folder/.git" ]; then
    # Best effort: if the update fails (e.g. no network) keep testing the
    # existing checkout. '&&' ensures git never runs in the wrong directory.
    if ! (cd "$folder" && git pull); then
        printf 'Warning: could not update "%s", using the existing checkout\n' "$folder" >&2
    fi
else
    # Without a checkout there is nothing to test; fail instead of reporting
    # a vacuous success.
    if ! git clone -- "$repo" "$folder"; then
        printf 'Failed to clone "%s" into "%s"\n' "$repo" "$folder" >&2
        exit 1
    fi
fi

tested=0
failed=0

# find is used instead of 'shopt -s globstar' so the script also works with
# bash 3.2. Hidden entries (such as .git) are pruned, matching what the '**'
# glob would visit. The list is read from fd 3 so the test executable keeps
# the caller's stdin.
while IFS= read -r -d '' -u 3 gguf; do
    if [ -f "$gguf.inp" ] && [ -f "$gguf.out" ]; then
        tested=$((tested + 1))
        # Keep going after a failure so every broken vocab is reported.
        if ! "$toktest" "$gguf"; then
            printf 'Test failed for "%s"\n' "$gguf" >&2
            failed=$((failed + 1))
        fi
    else
        printf 'Found "%s" without matching inp/out files, ignoring...\n' "$gguf"
    fi
done 3< <(find "$folder" -mindepth 1 -name '.*' -prune -o -name '*.gguf' -print0 | LC_ALL=C sort -z)

if [ "$tested" -eq 0 ]; then
    printf 'Warning: no testable *.gguf files found in "%s"\n' "$folder" >&2
fi

if [ "$failed" -ne 0 ]; then
    printf '%d of %d tokenizer test(s) failed\n' "$failed" "$tested" >&2
    exit 1
fi

exit 0

Loading