diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 81997fd942e85..70a859b630a7b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -62,16 +62,6 @@ env:
   # LLAMA_LOG_TIMESTAMPS: 1
 
 jobs:
-
-  # TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  # how to debug it.
-  # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
-
-  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  # how to debug it.
-  # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  # would be great if we fix these
-
   # CUDA Release
 
   ubuntu-latest-cmake:
@@ -120,7 +110,7 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_CUDA=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
           cmake --build . --config Release -j $(nproc)
 
       - name: Determine tag name
@@ -202,7 +192,7 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.6/lib64 ${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=OFF -DBUILD_SHARED_LIBS=ON -DGGML_VULKAN=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
           cmake --build . --config Release -j $(nproc)
 
       - name: Determine tag name
@@ -241,6 +231,7 @@ jobs:
 
   release:
     permissions: write-all
 
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
     runs-on: ubuntu-latest
 
diff --git a/Makefile b/Makefile
index b9131eae549f5..dfa32d51656d3 100644
--- a/Makefile
+++ b/Makefile
@@ -878,6 +878,10 @@ ifdef GGML_METAL
 	MK_CPPFLAGS += -DGGML_USE_METAL
 	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
 	OBJ_GGML    += ggml/src/ggml-metal.o
+
+ifdef GGML_METAL_USE_BF16
+	MK_CPPFLAGS += -DGGML_METAL_USE_BF16
+endif # GGML_METAL_USE_BF16
 ifdef GGML_METAL_NDEBUG
 	MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif
diff --git a/Package.swift b/Package.swift
index d3661d13c80f2..0f4f190180654 100644
--- a/Package.swift
+++ b/Package.swift
@@ -61,13 +61,15 @@ let package = Package(
             name: "llama",
             path: ".",
             exclude: [
+               "build",
                "cmake",
                "examples",
                "scripts",
                "models",
                "tests",
                "CMakeLists.txt",
-               "Makefile"
+               "Makefile",
+               "ggml/src/ggml-metal-embed.metal"
             ],
             sources: sources,
             resources: resources,
diff --git a/ci/run.sh b/ci/run.sh
index 21b62dd1ef733..20610e56009be 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -39,7 +39,7 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
 
 if [ ! -z ${GG_BUILD_METAL} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
diff --git a/examples/chat-persistent.sh b/examples/chat-persistent.sh
index d9cab9836482e..9d761ebb843af 100755
--- a/examples/chat-persistent.sh
+++ b/examples/chat-persistent.sh
@@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
 NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
 NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
 
-SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
-SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
+SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
+'|'\
+'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
 SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
 
 CTX_SIZE=2048
@@ -129,15 +130,12 @@ while read -e line; do
 
     printf ' '
 
-    # HACK get num tokens from debug message
-    # TODO get both messages in one go
-    if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
-       ! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
+    if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
         echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
         exit 1
     fi
 
-    n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
+    n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
 
     if ((n_tokens > CTX_ROTATE_POINT)); then
         tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index e7873a143fe05..1eddfd0db376a 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -256,6 +256,9 @@ static ggml_type ggml_type_from_name(const std::string & s) {
     if (s == "f16") {
         return GGML_TYPE_F16;
     }
+    if (s == "bf16") {
+        return GGML_TYPE_BF16;
+    }
     if (s == "q8_0") {
         return GGML_TYPE_Q8_0;
     }
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index bf1d1b7940e8a..55639a9448e71 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -200,23 +200,38 @@