diff --git a/.clang-tidy b/.clang-tidy index 952c0cca82580..310c3d182c8f2 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -17,8 +17,10 @@ Checks: > -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, portability-*, + -portability-simd-intrinsics, misc-*, -misc-const-correctness, -misc-non-private-member-variables-in-classes, -misc-no-recursion, + -misc-use-anonymous-namespace, FormatStyle: none diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile index 575e81b486d13..3193fea1e9ae5 100644 --- a/.devops/full-musa.Dockerfile +++ b/.devops/full-musa.Dockerfile @@ -6,6 +6,9 @@ ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_V FROM ${BASE_MUSA_DEV_CONTAINER} AS build +# MUSA architecture to build for (defaults to all supported archs) +ARG MUSA_DOCKER_ARCH=default + RUN apt-get update && \ apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 @@ -19,7 +22,11 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +# Use the default MUSA archs if not specified +RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release -j$(nproc) && \ cp build/bin/* . diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile index 3372749bee70e..e7c75af20e265 100644 --- a/.devops/llama-cli-musa.Dockerfile +++ b/.devops/llama-cli-musa.Dockerfile @@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU FROM ${BASE_MUSA_DEV_CONTAINER} AS build +# MUSA architecture to build for (defaults to all supported archs) +ARG MUSA_DOCKER_ARCH=default + RUN apt-get update && \ apt-get install -y build-essential git cmake @@ -15,7 +18,11 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ +# Use the default MUSA archs if not specified +RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release --target llama-cli -j$(nproc) && \ mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile index eb67201c18775..cebe51d42fa95 100644 --- a/.devops/llama-server-musa.Dockerfile +++ b/.devops/llama-server-musa.Dockerfile @@ -8,6 +8,9 @@ ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU FROM ${BASE_MUSA_DEV_CONTAINER} AS build +# MUSA architecture to build for (defaults to all supported archs) +ARG MUSA_DOCKER_ARCH=default + RUN apt-get update && \ apt-get install -y build-essential git cmake libcurl4-openssl-dev @@ -15,7 +18,11 @@ WORKDIR /app COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . 
&& \ +# Use the default MUSA archs if not specified +RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ cmake --build build --config Release --target llama-server -j$(nproc) && \ mkdir -p /app/lib && \ find build -name "*.so" -exec cp {} /app/lib \; diff --git a/.devops/nix/python-scripts.nix b/.devops/nix/python-scripts.nix index 392e9ffe41bf5..56ea182788764 100644 --- a/.devops/nix/python-scripts.nix +++ b/.devops/nix/python-scripts.nix @@ -34,7 +34,7 @@ let # server tests openai - behave + pytest prometheus-client ]; in diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml index 550ee1b498ab9..f10b3a2b22648 100644 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -24,7 +24,8 @@ body: - type: dropdown id: operating-system attributes: - label: Which operating systems do you know to be affected? + label: Operating systems + description: Which operating systems do you know to be affected? multiple: true options: - Linux @@ -41,14 +42,17 @@ body: description: Which GGML backends do you know to be affected? options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] multiple: true + validations: + required: true - type: textarea - id: steps_to_reproduce + id: info attributes: - label: Steps to Reproduce + label: Problem description & steps to reproduce description: > - Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + Please give us a summary of the problem and tell us how to reproduce it. If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us. placeholder: > + I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY. Here are the exact commands that I used: ... validations: required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index 1adb162b79a77..1ccef0793d45e 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -26,7 +26,8 @@ body: - type: dropdown id: operating-system attributes: - label: Which operating systems do you know to be affected? + label: Operating systems + description: Which operating systems do you know to be affected? multiple: true options: - Linux @@ -43,6 +44,8 @@ body: description: Which GGML backends do you know to be affected? options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan] multiple: true + validations: + required: true - type: textarea id: hardware attributes: @@ -55,20 +58,20 @@ body: - type: textarea id: model attributes: - label: Model + label: Models description: > - Which model at which quantization were you using when encountering the bug? + Which model(s) at which quantization were you using when encountering the bug? If you downloaded a GGUF file off of Huggingface, please provide a link. placeholder: > e.g. 
Meta LLaMA 3.1 Instruct 8b q4_K_M validations: required: false - type: textarea - id: steps_to_reproduce + id: info attributes: - label: Steps to Reproduce + label: Problem description & steps to reproduce description: > - Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + Please give us a summary of the problem and tell us how to reproduce it. If you can narrow down the bug to specific hardware, compile flags, or command line arguments, that information would be very much appreciated by us. placeholder: > diff --git a/.github/ISSUE_TEMPLATE/019-bug-misc.yml b/.github/ISSUE_TEMPLATE/019-bug-misc.yml index 124cdee919089..d157ea307e50f 100644 --- a/.github/ISSUE_TEMPLATE/019-bug-misc.yml +++ b/.github/ISSUE_TEMPLATE/019-bug-misc.yml @@ -14,7 +14,7 @@ body: id: version attributes: label: Name and Version - description: Which version of our software are you running? (use `--version` to get a version string) + description: Which version of our software is affected? (You can use `--version` to get a version string.) placeholder: | $./llama-cli --version version: 2999 (42b4109e) @@ -24,7 +24,8 @@ body: - type: dropdown id: operating-system attributes: - label: Which operating systems do you know to be affected? + label: Operating systems + description: Which operating systems do you know to be affected? multiple: true options: - Linux @@ -33,28 +34,30 @@ body: - BSD - Other? (Please let us know in description) validations: - required: true + required: false - type: dropdown id: module attributes: label: Which llama.cpp modules do you know to be affected? multiple: true options: + - Documentation/Github - libllama (core library) - llama-cli - llama-server - llama-bench - llama-quantize - Python/Bash scripts + - Test code - Other (Please specify in the next section) validations: - required: true + required: false - type: textarea - id: steps_to_reproduce + id: info attributes: - label: Steps to Reproduce + label: Problem description & steps to reproduce description: > - Please tell us how to reproduce the bug and any additional information that you think could be useful for fixing it. + Please give us a summary of the problem and tell us how to reproduce it (if applicable). validations: required: true - type: textarea @@ -62,7 +65,7 @@ body: attributes: label: First Bad Commit description: > - If the bug was not present on an earlier version: when did it start appearing? + If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing? If possible, please do a git bisect and identify the exact commit that introduced the bug. validations: required: false @@ -71,8 +74,8 @@ body: attributes: label: Relevant log output description: > - Please copy and paste any relevant log output, including the command that you entered and any generated text. + If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text. This will be automatically formatted into code, so no need for backticks. 
render: shell validations: - required: true + required: false diff --git a/.github/labeler.yml b/.github/labeler.yml index 89436740d1ffb..1b47bc96885c4 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,19 +3,18 @@ Kompute: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute.cpp + - ggml/src/ggml-kompute/** - README-kompute.md Apple Metal: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-metal.h - - ggml/src/ggml-metal.cpp + - ggml/src/ggml-metal/** - README-metal.md SYCL: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-sycl.h - - ggml/src/ggml-sycl.cpp - ggml/src/ggml-sycl/** - docs/backend/SYCL.md - examples/sycl/** @@ -27,8 +26,8 @@ Nvidia GPU: Vulkan: - changed-files: - any-glob-to-any-file: - - ggml/ggml_vk_generate_shaders.py - - ggml/src/ggml-vulkan* + - ggml/include/ggml-vulkan.h + - ggml/src/ggml-vulkan/** documentation: - changed-files: - any-glob-to-any-file: @@ -75,11 +74,7 @@ server: ggml: - changed-files: - any-glob-to-any-file: - - ggml/include/ggml*.h - - ggml/src/ggml*.c - - ggml/src/ggml*.cpp - - ggml/src/ggml*.h - - ggml-cuda/** + - ggml/** nix: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index efd6726d08a50..a6d35816523b7 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -721,7 +721,7 @@ jobs: cmake --build build --config ${{ matrix.build }} -j $(nproc) windows-latest-cmake: - runs-on: windows-2019 + runs-on: windows-latest env: OPENBLAS_VERSION: 0.3.23 @@ -864,12 +864,33 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip name: llama-bin-win-${{ matrix.build }}.zip - windows-latest-cmake-cuda: + ubuntu-latest-cmake-cuda: + runs-on: ubuntu-latest + container: nvidia/cuda:12.6.2-devel-ubuntu24.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + + - name: Install dependencies + env: + DEBIAN_FRONTEND: noninteractive + run: | + apt update + apt install -y cmake build-essential ninja-build libgomp1 git + + - name: Build with CMake + run: | + cmake -S . 
-B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON + cmake --build build + + windows-2019-cmake-cuda: runs-on: windows-2019 strategy: matrix: - cuda: ['12.2.0', '11.7.1'] + cuda: ['12.4', '11.7'] build: ['cuda'] steps: @@ -877,24 +898,83 @@ jobs: id: checkout uses: actions/checkout@v4 with: - fetch-depth: 0 - - - name: Install CUDA toolkit - id: cuda-toolkit - uses: Jimver/cuda-toolkit@v0.2.15 + fetch-depth: 0 + + - name: Install Cuda Toolkit 11.7 + if: ${{ matrix.cuda == '11.7' }} + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-11.7.99-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-11.7.4.6-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-11.7.91-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-11.7.91-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-11.7.101-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-11.7.91-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cudart-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvcc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvrtc-windows-x86_64-11.7.99-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libcublas-windows-x86_64-11.7.4.6-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvtx-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\visual_studio_integration-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_nvprof-windows-x86_64-11.7.101-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\cuda_cccl-windows-x86_64-11.7.91-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin" | Out-File -FilePath $env:GITHUB_PATH 
-Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V11_7=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Install Cuda Toolkit 12.4 + if: ${{ matrix.cuda == '12.4' }} + run: | + mkdir -p "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + choco install unzip -y + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-12.4.131-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/libcublas/windows-x86_64/libcublas-windows-x86_64-12.4.5.8-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-12.4.127-archive.zip" + curl -O "https://developer.download.nvidia.com/compute/cuda/redist/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-12.4.127-archive.zip" + unzip '*.zip' -d "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_cudart-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvcc-windows-x86_64-12.4.131-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvrtc-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libcublas-windows-x86_64-12.4.5.8-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvtx-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_profiler_api-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\visual_studio_integration-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\cuda_nvprof-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + xcopy "C:\Program Files\NVIDIA GPU Computing 
Toolkit\CUDA\v12.4\cuda_cccl-windows-x86_64-12.4.127-archive\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" /E /I /H /Y + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + echo "CUDA_PATH_V12_4=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8 + + - name: Install ccache + uses: hendrikmuhs/ccache-action@v1.2 with: - cuda: ${{ matrix.cuda }} - method: 'network' - sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]' + key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }} + + - name: Install Ninja + id: install_ninja + run: | + choco install ninja - name: Build id: cmake_build + shell: cmd run: | - mkdir build - cd build - cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON - cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml - cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} + call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON -DGGML_RPC=ON + set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1 + cmake --build build --config Release -j %NINJA_JOBS% -t ggml + cmake --build build --config Release - name: Determine tag name id: tag @@ -923,10 +1003,12 @@ jobs: name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime + if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} run: | - echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}" + echo "Cuda install location: ${{ env.CUDA_PATH }}" $dst='.\build\bin\cudart\' - robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + robocopy "${{env.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll + robocopy "${{env.CUDA_PATH}}\lib" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll 7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\* - name: Upload Cuda runtime @@ -977,7 +1059,7 @@ jobs: - name: Build the release package id: pack_artifacts - if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} run: | echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" @@ -1002,7 +1084,7 @@ jobs: 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* - name: Upload the release package - if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }} + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} uses: actions/upload-artifact@v4 with: path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip @@ -1032,6 +1114,11 @@ jobs: run: | & 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version + - name: Install ccache + uses: hendrikmuhs/ccache-action@v1.2 + with: + key: ${{ github.job }} + - 
name: Build id: cmake_build run: | @@ -1052,6 +1139,8 @@ jobs: - name: Clone id: checkout uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Install id: depends @@ -1166,7 +1255,7 @@ jobs: - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-cuda + - windows-2019-cmake-cuda - windows-latest-cmake-hip-release - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 diff --git a/AUTHORS b/AUTHORS index 1bd36158a72f4..2eb60806ad058 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,4 +1,4 @@ -# date: Wed Jun 26 19:36:34 EEST 2024 +# date: Thu Nov 28 20:46:15 EET 2024 # this file is auto-generated by scripts/gen-authors.sh 0cc4m @@ -7,6 +7,7 @@ 2f38b454 3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> 44670 <44670@users.noreply.github.com> +65a <10104049+65a@users.noreply.github.com> AN Long AT Aarni Koskela @@ -19,20 +20,28 @@ Adithya Balaji AdithyanI Adrian Adrian Hesketh +Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com> Ahmet Zeer AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> +AidanBeltonS Aisuko +Akarshan Biswas Akarshan Biswas +Al Mochkin <14274697+amochkin@users.noreply.github.com> Albert Jin Alberto <57916483+albbus-stack@users.noreply.github.com> +Alberto Cabrera Pérez +Alberto Cabrera Pérez Alex Alex Azarov Alex Azarov Alex Klinkhamer Alex Klinkhamer Alex Nguyen +Alex O'Connell <35843486+acon96@users.noreply.github.com> Alex Petenchea Alex Renda +Alex Tuddenham <61622354+AlexsCode@users.noreply.github.com> Alex von Gluck IV Alexey Parfenov Ali Chraghi <63465728+alichraghi@users.noreply.github.com> @@ -45,18 +54,25 @@ AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> Ananta Bastola Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> András Salamon +Andreas (Andi) Kunar Andrei Andrew Canis Andrew Downing Andrew Duffy Andrew Godfrey +Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com> +Andy Salerno Andy Tai +Anthony Van de Gejuchte +Antonis Makropoulos Arik Poznanski +Armen Kaleshian Artem Artem Zinnatullin Artyom Lebedev Asbjørn Olling Ásgeir Bjarni Ingvarsson +Asghar Ghorbani Ashish <1856117+ashishdatta@users.noreply.github.com> Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Ashraful Islam @@ -76,12 +92,16 @@ Ben Williams Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com> Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> Bernat Vadell +Bert Wagner Bingan <70050083+binganao@users.noreply.github.com> +Bjarke Viksøe <164612031+bviksoe@users.noreply.github.com> Bodo Graumann Bono Lv Borislav Stanimirov Branden Butler +Brandon Squizzato <35474886+bsquizz@users.noreply.github.com> Brian +Brian Cunnie Bruce MacDonald Bryan Honof CJ Pais @@ -90,32 +110,47 @@ Calvin Laurenson Cameron Cameron Kaiser Carolinabanana <140120812+Carolinabanana@users.noreply.github.com> +CarryFun <76023481+CarryFun@users.noreply.github.com> +Carsten Kragelund Jørgensen +CarterLi999 <664681047@qq.com> Casey Primozic Casey Primozic CausalLM <148736309+CausalLM@users.noreply.github.com> Cebtenzzre Chad Brewbaker +Changyeon Kim Chao Jiang +Charles Xu <63788048+chaxu01@users.noreply.github.com> +Charles Xu +Chen Xi +Chen Xi Cheng Shao +Chenguang Li <87689256+noemotiovon@users.noreply.github.com> Chris Elrod Chris Kuehl Christian Demsar Christian Demsar Christian Falch <875252+chrfalch@users.noreply.github.com> Christian Kögler +Christian Köhnenkamp Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Clark Saben <76020733+csaben@users.noreply.github.com> Clint Herron 
+Conrad Kramer CrispStrobe <154636388+CrispStrobe@users.noreply.github.com> +Csaba Kecskemeti Cuong Trinh Manh DAN™ Damian Stewart +Dan Johansson <164997844+eddnjjn@users.noreply.github.com> +Dan Johansson Dane Madsen DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> Daniel Bevenius Daniel Drake Daniel Hiltgen Daniel Illescas Romero +Daniel Kleine <53251018+d-kleine@users.noreply.github.com> Daniele <57776841+daniandtheweb@users.noreply.github.com> DannyDaemonic Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> @@ -129,19 +164,28 @@ David Pflug David Renshaw David Sommers <12738+databyte@users.noreply.github.com> David Yang +DavidKorczynski Dawid Potocki Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> Dean Deins +Denis Spasyuk <34203011+dspasyuk@users.noreply.github.com> +Derrick T. Woolworth Deven Mistry <31466137+deven367@users.noreply.github.com> +Dibakar Gope Didzis Gosko +Diego Devesa +Diogo Teles Sant'Anna Djip007 Don Mahurin DooWoong Lee (David) Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> +Dou Xinpeng <15529241576@163.com> +Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com> Douglas Hanley Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> Ebey Abraham +Echo Nolan Ed Lee Ed Lepedus Eddie-Wang @@ -151,10 +195,13 @@ Elbios <141279586+Elbios@users.noreply.github.com> Elton Kola Engininja2 <139037756+Engininja2@users.noreply.github.com> Equim +Eric Curtin +Eric Curtin Eric Sommerlade Eric Zhang <34133756+EZForever@users.noreply.github.com> Erik Garrison Erik Scholz +Esko Toivonen Ettore Di Giacinto Evan Jones Evan Miller @@ -166,19 +213,26 @@ FK Fabian Fabio R. Sluzala Faez Shakil +Faisal Zaghloul +Faisal Zaghloul +Fan Shupei FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> +Farbod Bijary <110523279+farbodbj@users.noreply.github.com> Fattire <528174+fat-tire@users.noreply.github.com> Felix Finn Voorhees Firat +FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com> Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> Francisco Melo <43780565+francis2tm@users.noreply.github.com> Frank Mai FrankHB +Frankie Robertson Fred Douglas <43351173+fredlas@users.noreply.github.com> Frederik Vogel Gabe Goodhart +Gabe Goodhart GainLee Galunid Gary Linscott @@ -187,11 +241,13 @@ Gavin Zhao Genkagaku.GPT Georgi Gerganov Gilad S +Gilad S. 
<7817232+giladgd@users.noreply.github.com> Giuseppe Scrivano GiviMAD Govlzkoy Guillaume "Vermeille" Sanchez Guillaume Wenzek +Guoliang Hua <32868157+nbcsm@users.noreply.github.com> Guoteng <32697156+SolenoidWGT@users.noreply.github.com> Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> Haggai Nuchi @@ -213,11 +269,14 @@ Hong Bo PENG Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> Howard Su Hua Jiang +Huang Qi Huawei Lin Hugo Roussel +Huifeng Ou <79071290+ho2103@users.noreply.github.com> Ian Bull Ian Bull Ian Scrivener +Icecream95 Ido S IgnacioFDM Igor Okulist @@ -226,11 +285,15 @@ Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> Ionoclast Laboratories Isaac McFadyen IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> +Ivan +Ivan Filipov <159561759+vanaka11@users.noreply.github.com> Ivan Komarov Ivan Stepanov JH23X <165871467+JH23X@users.noreply.github.com> +Jack Mousseau Jack Mousseau JackJollimore <130917767+JackJollimore@users.noreply.github.com> +Jaeden Amero Jaemin Son Jag Chadha Jakub N @@ -243,10 +306,14 @@ Jannis Schönleber Jared Van Bortel Jared Van Bortel Jason McCartney +Jason Stillerman Jean-Christophe Hoelt Jean-Michaël Celerier Jed Fox +Jeff Bolz +Jeffrey Morgan Jeffrey Quesnelle +Jeroen Mostert Jesse Jojo Johnson Jeximo Jhen-Jie Hong @@ -258,6 +325,9 @@ Jiří Podivín <66251151+jpodivin@users.noreply.github.com> Jiří Sejkora Joan Fontanals Joan Fontanals +João Dinis Ferreira +Joe Eli McIlvain +Joe Todd Johan Johannes Gäßler Johannes Rudolph @@ -274,7 +344,9 @@ Joyce Juan Calderon-Perez <835733+gaby@users.noreply.github.com> Judd Julius Arkenberg +Jun Hee Yoo Jun Jie <71215065+junnjiee16@users.noreply.github.com> +Junil Kim Junyang Lin Juraj Bednar Justin Parker @@ -292,12 +364,14 @@ Karthik Sethuraman Kasumi <90275229+kasumi-1@users.noreply.github.com> Kawrakow <48489457+ikawrakow@users.noreply.github.com> Keiichi Tabata +Keke Han Kenvix ⭐ Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Kevin Gibbons Kevin Ji <1146876+kevinji@users.noreply.github.com> Kevin Kwok Kevin Lo +Kevin Wang Kolen Cheung Konstantin Herud Konstantin Zhuravlyov @@ -315,22 +389,29 @@ LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> Leonardo Neumann Li Tan Linwei Wang +Liu Jia <109258120+Septa2112@users.noreply.github.com> +Liu Jia LoganDark +Loïc Carrère LostRuins <39025047+LostRuins@users.noreply.github.com> Luciano Luo Tian Lyle Dean +M-A M. Yusuf Sarıgöz +Ma Mingfei Maarten ter Huurne Mack Straight Maël Kerbiriou MaggotHATE +Mahesh Madhav <67384846+heshpdx@users.noreply.github.com> Manuel <44313466+makuche@users.noreply.github.com> Marc Köhlbrugge Marco Matthies <71844+marcom@users.noreply.github.com> Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Marian Cepok Mark Fairbairn +Mark Zhuang Marko Tasic Markus Tavenrath Martin Delille @@ -342,11 +423,15 @@ MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com> Mateusz Charytoniuk Matheus C. 
França Matheus Gabriel Alves Silva +Mathieu Geli Mathieu Nayrolles +Mathijs Henquet Mathijs de Bruin Matt Clayton <156335168+mattjcly@users.noreply.github.com> Matt Pulver +Matt Stephenson Matteo Boschini <12133566+mbosc@users.noreply.github.com> +Matteo Mortari Mattheus Chediak Matthew Tejo Matvey Soloviev @@ -356,8 +441,10 @@ Maxime <672982+maximegmd@users.noreply.github.com> Maximilian Winter Meng Zhang Meng, Hengyu +Mengqing Cao Merrick Christensen Michael Coppola +Michael Francis Michael Hueschen Michael Kesper Michael Klimenko @@ -365,41 +452,57 @@ Michael Podvitskiy Michael Potter Michael de Gans Michaël de Vries +Michał Tuszyński Mihai Mike Mikko Juola Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> +Minsoo Cheong Mirko185 Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> +MistApproach <98988043+MistApproach@users.noreply.github.com> Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Mohammadreza Hendiani Mohammadreza Hendiani +Molly Sophia +MorganRO8 <47795945+MorganRO8@users.noreply.github.com> Murilo Santana Musab Gultekin Nam D. Tran <42194884+namtranase@users.noreply.github.com> Nathan Epstein +Natsu NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> Nebula Neo Zhang <14088817+arthw@users.noreply.github.com> Neo Zhang Neo Zhang Jianyu Neuman Vong +Nexes the Old <124105151+Nexesenex@users.noreply.github.com> Nexesenex <124105151+Nexesenex@users.noreply.github.com> Niall Coates <1349685+Niall-@users.noreply.github.com> +Nicholai Tukanov +Nico Bosshard Nicolai Weitkemper Nicolás Pérez Nigel Bosch Niklas Korz +NikolaiLyssogor <59844691+NikolaiLyssogor@users.noreply.github.com> Nikolas <127742645+nneubacher@users.noreply.github.com> Nindaleth +OSecret <135510162+OLSecret@users.noreply.github.com> Oleksandr Nikitin Oleksii Maryshchenko Olivier Chafik Ondřej Čertík Ouadie EL FAROUKI +PAB +Pablo Duboue +Pascal Patry Patrice Ferlet Paul Tsochantaris +Pavel Zloi Pavol Rusnak +Paweł Wodnicki <151604+32bitmicro@users.noreply.github.com> Pedro Cuenca Peter Sugihara Phil H <5756783+phiharri@users.noreply.github.com> @@ -407,10 +510,15 @@ Philip Taron Phillip Kravtsov Pierre Alexandre SCHEMBRI Pierrick Hymbert +Pieter Ouwerkerk +Plamen Minev +Prashant Vithule <119530321+Vithulep@users.noreply.github.com> Przemysław Pawełczyk Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> Qingyou Meng Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> +R0CKSTAR +R0CKSTAR RJ Adriaansen Radoslav Gerganov Radosław Gryta @@ -419,11 +527,13 @@ Raj Hammeer Singh Hada Ralph Soika Rand Xie Randall Fitzgerald +Random Fly Reinforce-II Ren Xuancheng Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com> RhinoDevel Riceball LEE +Rich Dougherty Richard Kiss Richard Roberson Rick G <26732651+TheFlipbook@users.noreply.github.com> @@ -439,21 +549,30 @@ Robey Holderith Robyn Roger Meier Roland <14355895+rbur0425@users.noreply.github.com> +Romain Biessy Romain D <90720+Artefact2@users.noreply.github.com> Romain Neutron Roman Parykin Ron Evans Ron Jailall +Roni Ronny Brendel Ronsor Rowan Hart +Ruchira Hasaranga +Ruixin Huang <18860020911@163.com> Rune <43761327+Rune-AI@users.noreply.github.com> +RunningLeon +RunningLeon Ryan Landay Ryder Wishart Ryuei Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> +SRHMorris <69468379+SRHMorris@users.noreply.github.com> +SXX SakuraUmi Salvador E. 
Tropea +Salvatore Mesoraca Sam Spilsbury Sami Farin <3876865+Safari77@users.noreply.github.com> Samuel Maynard @@ -463,23 +582,29 @@ Sebastián A SebastianApel <13675545+SebastianApel@users.noreply.github.com> Senemu <10880819+Senemu@users.noreply.github.com> Sergey Alirzaev +Sergio López Sergio López Sertaç Özercan <852750+sozercan@users.noreply.github.com> SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> ShadovvBeast Shakhar Dasgupta +Shane A Shangning Xu <32517059+xushangning@users.noreply.github.com> +Shankar +Shanshan Shen <467638484@qq.com> Shijie <821898965@qq.com> Shintarou Okada Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> Shouzheng Liu Shuichi Tsutsumi +Shupei Fan Sigbjørn Skjæret Simon Willison Siwen Yu Sky Yan Slaren <2141330+slaren@users.noreply.github.com> Slava Primenko +Small Grass Forest SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> Someone Someone Serge @@ -491,12 +616,15 @@ Stefan Sydow Steffen Röcker Stephan Walter Stephen Nichols +Steve Bonds Steve Grubb Steven Prichard Steven Roussey Steward Garcia <57494570+FSSRepo@users.noreply.github.com> +StrangeBytesDev <141275258+StrangeBytesDev@users.noreply.github.com> Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> SuperUserNameMan +Sutou Kouhei Tai Duc Nguyen Taikono-Himazin Tameem <113388789+AhmadTameem@users.noreply.github.com> @@ -507,7 +635,9 @@ Theia Vogel Thérence <13496987+Royalphax@users.noreply.github.com> Thibault Terrasson Thomas Klausner +Thorsten Sommer Tim Miller +Tim Wang Timmy Knight Timothy Cronin <40186632+4imothy@users.noreply.github.com> Ting Lou @@ -517,24 +647,31 @@ Tom C Tom Jobbins <784313+TheBloke@users.noreply.github.com> Tomas Tomáš Pazdiora +Tony Wasserka <4840017+neobrain@users.noreply.github.com> Tristan Druyen Tristan Ross +Trivikram Kamat <16024985+trivikr@users.noreply.github.com> Tungsten842 <886724vf@anonaddy.me> Tungsten842 Tushar UEXTM.com <84163508+uextm@users.noreply.github.com> +Ujjawal Panchal <31011628+Ujjawal-K-Panchal@users.noreply.github.com> Ulrich Drepper Uzo Nweke Vaibhav Srivastav Val Kharitonov Valentin Konovalov Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> +Vali Malinoiu <0x4139@gmail.com> Victor Nogueira Victor Z. 
Peng +Viet-Anh NGUYEN (Andrew) +Vinesh Janarthanan <36610342+VJHack@users.noreply.github.com> Vlad Vladimir Vladimir Malyutin Vladimir Zorin +VoidIsVoid <343750470@qq.com> Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> Weird Constructor @@ -551,15 +688,22 @@ Xiang (Kevin) Li Xiao-Yong Jin XiaotaoChen Xiaoyi Chen +Xie Yanbo Xingchen Song(宋星辰) +Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com> Xuan Son Nguyen +Yaiko Yann Follet <131855179+YannFollet@users.noreply.github.com> Yaroslav Yazan Agha-Schrader Yiming Cui Yishuo Wang +Yoshi Suhara +Yoshi Suhara +Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> Yui +Yuri Khrustalev Yusuf Kağan Hanoğlu Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> ZHAOKAI WANG @@ -568,6 +712,8 @@ Zay <95888118+isaiahbjork@users.noreply.github.com> Zenix Zhang Peiyuan Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com> +Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com> +Zhiyuan Li ZhouYuChen Ziad Ben Hadj-Alouane Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> @@ -581,6 +727,7 @@ alexpinel <93524949+alexpinel@users.noreply.github.com> alonfaraj alwqx amd-lalithnc +amritahs-ibm andrijdavid anon998 <131767832+anon998@users.noreply.github.com> anzz1 @@ -588,14 +735,18 @@ apaz apcameron <37645737+apcameron@users.noreply.github.com> arch-btw <57669023+arch-btw@users.noreply.github.com> arcrank +ardfork <134447697+ardfork@users.noreply.github.com> arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> at8u <129688334+at8u@users.noreply.github.com> automaticcat +awatuna <23447591+awatuna@users.noreply.github.com> +b4b4o bandoti <141645996+bandoti@users.noreply.github.com> beiller bhubbb <79117352+bhubbb@users.noreply.github.com> bmwl bobqianic <129547291+bobqianic@users.noreply.github.com> +brucepro bryanSwk <93190252+bryanSwk@users.noreply.github.com> bsilvereagle bssrdf @@ -614,10 +765,14 @@ cpumaxx <163466046+cpumaxx@users.noreply.github.com> crasm crasm daboe01 +daghanerdonmez <44506702+daghanerdonmez@users.noreply.github.com> +daminho <37615795+daminho@users.noreply.github.com> david raistrick ddh0 ddpasa <112642920+ddpasa@users.noreply.github.com> deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> +devojony <61173062+devojony@users.noreply.github.com> +ditsuke divinity76 dm4 dotpy314 <33351922+dotpy314@users.noreply.github.com> @@ -629,14 +784,18 @@ ebraminio eiery <19350831+eiery@users.noreply.github.com> eric8607242 fairydreaming <166155368+fairydreaming@users.noreply.github.com> +fengerhu1 <2748250768@qq.com> fraxy-v <65565042+fraxy-v@users.noreply.github.com> github-actions[bot] gliptic goerch grahameth <96447521+grahameth@users.noreply.github.com> +gtygo gwjr <502526+gwjr@users.noreply.github.com> h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> hankcs +haopeng <657407891@qq.com> +hipudding hoangmit hongbo.mo <352280764@qq.com> hopkins385 <98618192+hopkins385@users.noreply.github.com> @@ -649,12 +808,14 @@ hxer7963 hydai iSma iacore <74560659+iacore@users.noreply.github.com> +icppWorld <124377669+icppWorld@users.noreply.github.com> igarnier intelmatt <61025942+intelmatt@users.noreply.github.com> iohub jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> jameswu2014 <545426914@qq.com> +jdomke <28772296+jdomke@users.noreply.github.com> jiez <373447296@qq.com> jneem 
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com> @@ -677,28 +838,35 @@ klosax <131523366+klosax@users.noreply.github.com> kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> kunnis kuronekosaiko +kustaaya <58045274+kustaaya@users.noreply.github.com> kuvaus <22169537+kuvaus@users.noreply.github.com> kwin1412 <42286931+kwin1412@users.noreply.github.com> l3utterfly +laik ldwang le.chang leejet +leo-pony limitedAtonement liuwei-git <14815172+liuwei-git@users.noreply.github.com> lon <114724657+longregen@users.noreply.github.com> loonerin <132926317+loonerin@users.noreply.github.com> +ltoniazzi <61414566+ltoniazzi@users.noreply.github.com> luoyu-intel m3ndax maddes8cht <55592906+maddes8cht@users.noreply.github.com> makomk manikbhandari maor-ps <154728172+maor-ps@users.noreply.github.com> +matiaslin <45382001+matiaslin@users.noreply.github.com> +matteo mdrokz mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> minarchist mj-shifu <77107165+mj-shifu@users.noreply.github.com> mmyjona momonga <115213907+mmnga@users.noreply.github.com> +momonga <146910567+mmngays@users.noreply.github.com> moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> mzcu nanahi <130121847+na-na-hi@users.noreply.github.com> @@ -716,8 +884,10 @@ omahs <73983677+omahs@users.noreply.github.com> oobabooga <112222186+oobabooga@users.noreply.github.com> opparco ostix360 <55257054+ostix360@users.noreply.github.com> +pculliton pengxin99 perserk +piDack <104877312+piDack@users.noreply.github.com> pmysl postmasters pudepiedj @@ -733,6 +903,7 @@ runfuture sandyiscool sasha0552 semidark +serhii-nakon <57632032+serhii-nakon@users.noreply.github.com> sharpHL <132747147+sharpHL@users.noreply.github.com> shibe2 singularity <12184989+singularity-s0@users.noreply.github.com> @@ -741,42 +912,55 @@ sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com> slaren <2141330+slaren@users.noreply.github.com> slaren snadampal <87143774+snadampal@users.noreply.github.com> +standby24x7 staviq stduhpf strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com> swittk takov751 <40316768+takov751@users.noreply.github.com> tarcey +tc-mb <157115220+tc-mb@users.noreply.github.com> texmex76 <40733439+texmex76@users.noreply.github.com> thement <40525767+thement@users.noreply.github.com> +thewh1teagle <61390950+thewh1teagle@users.noreply.github.com> tjohnman +toyer <2042519524@qq.com> tslmy ubik2 uint256_t uint256_t unbounded +uvos valiray <133289098+valiray@users.noreply.github.com> +vb vik viric vodkaslime <646329483@qq.com> vvhg1 <94630311+vvhg1@users.noreply.github.com> vxiiduu <73044267+vxiiduu@users.noreply.github.com> +wangshuai09 <391746016@qq.com> wbpxre150 <100937007+wbpxre150@users.noreply.github.com> whoreson <139810751+whoreson@users.noreply.github.com> woachk <24752637+woachk@users.noreply.github.com> wonjun Jang woodx <124784234+woodx9@users.noreply.github.com> +wwoodsTM <104587230+wwoodsTM@users.noreply.github.com> wzy <32936898+Freed-Wu@users.noreply.github.com> xaedes xaedes +xctan xloem <0xloem@gmail.com> yangli2 yuiseki +yuri@FreeBSD zakkor zhangkaihuo +zhentaoyu zhouwg <6889919+zhouwg@users.noreply.github.com> zhouwg zrm Ștefan-Gabriel Muscalu +杨朱 · Kiki 源文雨 <41315874+fumiama@users.noreply.github.com> +蕭澧邦 <45505768+shou692199@users.noreply.github.com> Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a440acaa6d2f..08cd15594a6e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,7 @@ 
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake) # override ggml options set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD}) @@ -178,8 +179,11 @@ if (GGML_TARGET_DEFINES) list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES}) endif() get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES) - -set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h) +# all public headers +set(LLAMA_PUBLIC_HEADERS + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h + ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h) +set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}") install(TARGETS llama LIBRARY PUBLIC_HEADER) configure_package_config_file( diff --git a/Makefile b/Makefile index dd6d864ad513a..e308216240c8e 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ BUILD_TARGETS = \ llama-server \ llama-simple \ llama-simple-chat \ + llama-run \ llama-speculative \ llama-tokenize \ llama-vdot \ @@ -250,11 +251,11 @@ endif # Compile flags # -# keep standard at C11 and C++11 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon +# keep standard at C11 and C++17 +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC -MK_CXXFLAGS = -std=c++11 -fPIC -MK_NVCCFLAGS = -std=c++11 +MK_CXXFLAGS = -std=c++17 -fPIC +MK_NVCCFLAGS = -std=c++17 ifdef LLAMA_NO_CCACHE GGML_NO_CCACHE := 1 @@ -290,6 +291,7 @@ endif # some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE + MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, @@ -573,9 +575,12 @@ endif ifndef GGML_NO_AMX MK_CPPFLAGS += -DGGML_USE_AMX - OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o + OBJ_GGML_EXT += ggml/src/ggml-cpu/amx/amx.o ggml/src/ggml-cpu/amx/mmq.o endif +# only necessary for the CPU backend files +MK_CPPFLAGS += -Iggml/src/ggml-cpu + ifdef GGML_RPC MK_CPPFLAGS += -DGGML_USE_RPC OBJ_GGML_EXT += ggml/src/ggml-rpc.o @@ -750,7 +755,7 @@ vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp endif # GGML_VULKAN -ifdef GGML_HIPBLAS +ifdef GGML_HIP ifeq ($(wildcard /opt/rocm),) ROCM_PATH ?= /usr AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) @@ -805,7 +810,7 @@ ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-common.h \ ggml/src/ggml-cuda/common.cuh $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< -endif # GGML_HIPBLAS +endif # GGML_HIP ifdef GGML_MUSA ifeq ($(wildcard /opt/musa),) @@ -813,7 +818,7 @@ ifdef GGML_MUSA else MUSA_PATH ?= /opt/musa endif - MTGPU_TARGETS ?= mp_21 mp_22 + MUSA_ARCHITECTURES ?= 21;22 MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib @@ -832,7 +837,8 @@ ifdef GGML_MUSA CXX := $(MUSA_PATH)/bin/clang++ MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc - MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS)) + MUSAFLAGS = -x musa -mtgpu + MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch)) ifdef GGML_CUDA_FORCE_MMQ MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ @@ -876,14 +882,14 @@ ggml/src/ggml-cuda/ggml-cuda.o: \ ggml/src/ggml-backend-impl.h \ ggml/src/ggml-common.h \ $(wildcard ggml/src/ggml-cuda/*.cuh) - $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< + $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c 
-o $@ $< ggml/src/ggml-cuda/%.o: \ ggml/src/ggml-cuda/%.cu \ ggml/include/ggml.h \ ggml/src/ggml-common.h \ ggml/src/ggml-cuda/common.cuh - $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $< + $(MCC) $(CXXFLAGS) $(MUSAFLAGS) -c -o $@ $< endif # GGML_MUSA ifdef GGML_METAL @@ -1166,6 +1172,11 @@ llama-infill: examples/infill/infill.cpp \ $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +llama-run: examples/run/run.cpp \ + $(OBJ_ALL) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + llama-simple: examples/simple/simple.cpp \ $(OBJ_ALL) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) diff --git a/Package.swift b/Package.swift index 6b68aecdebec1..1e75aa7e2538b 100644 --- a/Package.swift +++ b/Package.swift @@ -28,13 +28,16 @@ var cSettings: [CSetting] = [ .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .unsafeFlags(["-fno-objc-arc"]), .headerSearchPath("ggml/src"), + .headerSearchPath("ggml/src/ggml-cpu"), // NOTE: NEW_LAPACK will required iOS version 16.4+ // We should consider add this in the future when we drop support for iOS 14 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) // .define("ACCELERATE_NEW_LAPACK"), // .define("ACCELERATE_LAPACK_ILP64") + .define("GGML_USE_CPU"), ] + #if canImport(Darwin) sources.append("ggml/src/ggml-common.h") sources.append("ggml/src/ggml-metal/ggml-metal.m") @@ -43,7 +46,7 @@ linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL") + .define("GGML_USE_METAL"), ] ) #endif diff --git a/README.md b/README.md index 5f7933c132dc3..37c6666aac122 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) -[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) @@ -26,7 +25,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Description The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide -variety of hardware - locally and in the cloud. +range of hardware - locally and in the cloud. - Plain C/C++ implementation without any dependencies - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks @@ -36,14 +35,17 @@ variety of hardware - locally and in the cloud. - Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity -Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has -improved significantly thanks to many contributions. It is the main playground for developing new features for the -[ggml](https://github.com/ggerganov/ggml) library. 
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
 
-**Supported models:**
+<details>
+Models Typically finetunes of the base models below are supported as well. +Instructions for adding support for new models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md) + +**Text-only:** + - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 - [x] LLaMA 3 🦙🦙🦙 @@ -79,6 +81,7 @@ Typically finetunes of the base models below are supported as well. - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [OLMo](https://allenai.org/olmo) +- [x] [OLMo 2](https://allenai.org/olmo) - [x] [OLMoE](https://huggingface.co/allenai/OLMoE-1B-7B-0924) - [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330) - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia) @@ -96,9 +99,7 @@ Typically finetunes of the base models below are supported as well. - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) -(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md)) - -**Multimodal models:** +**Multimodal:** - [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava) @@ -110,7 +111,10 @@ Typically finetunes of the base models below are supported as well. - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Bunny](https://github.com/BAAI-DCAI/Bunny) -**Bindings:** +
+
+<details>
+Bindings - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp) @@ -137,173 +141,121 @@ Typically finetunes of the base models below are supported as well. - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift) - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama) -**UI:** +
-Unless otherwise noted these projects are open-source with permissive licensing: +
+UIs -- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) -- [iohub/collama](https://github.com/iohub/coLLaMA) +*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* + +- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) +- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) +- [Dot](https://github.com/alexpinel/Dot) (GPL) +- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) +- [iohub/collama](https://github.com/iohub/coLLaMA) (Apache-2.0) - [janhq/jan](https://github.com/janhq/jan) (AGPL) -- [nat/openplayground](https://github.com/nat/openplayground) -- [Faraday](https://faraday.dev/) (proprietary) +- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file) (Apache-2.0) +- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) +- [llama.vim](https://github.com/ggml-org/llama.vim) (MIT) +- [LARS](https://github.com/abgulati/LARS) (AGPL) +- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) +- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) +- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) - [LMStudio](https://lmstudio.ai/) (proprietary) -- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) -- [ramalama](https://github.com/containers/ramalama) (MIT) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) -- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) -- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) -- [ollama/ollama](https://github.com/ollama/ollama) +- [MindMac](https://mindmac.app) (proprietary) +- [MindWorkAI/AI-Studio](https://github.com/MindWorkAI/AI-Studio) (FSL-1.1-MIT) +- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) (Apache-2.0) +- [nat/openplayground](https://github.com/nat/openplayground) (MIT) +- [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all) (MIT) +- [ollama/ollama](https://github.com/ollama/ollama) (MIT) - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui) (AGPL) -- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) -- [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) -- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) +- [PocketPal AI](https://github.com/a-ghorbani/pocketpal-ai) (MIT) +- [psugihara/FreeChat](https://github.com/psugihara/FreeChat) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) (MIT) - [pythops/tenere](https://github.com/pythops/tenere) (AGPL) -- [RAGNA Desktop](https://ragna.app/) (proprietary) -- [RecurseChat](https://recurse.chat/) (proprietary) -- [semperai/amica](https://github.com/semperai/amica) -- [withcatai/catai](https://github.com/withcatai/catai) -- [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) -- [Msty](https://msty.app) (proprietary) -- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) -- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) -- [Dot](https://github.com/alexpinel/Dot) (GPL) -- [MindMac](https://mindmac.app) (proprietary) -- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) -- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) -- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) 
(MIT) -- [AIKit](https://github.com/sozercan/aikit) (MIT) -- [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL) -- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT) -- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL) -- [PocketPal AI - An iOS and Android App](https://github.com/a-ghorbani/pocketpal-ai) (MIT) +- [ramalama](https://github.com/containers/ramalama) (MIT) +- [semperai/amica](https://github.com/semperai/amica) (MIT) +- [withcatai/catai](https://github.com/withcatai/catai) (MIT) -*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* +
-**Tools:** +
+Tools - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML - [akx/ollama-dl](https://github.com/akx/ollama-dl) – download models from the Ollama library to be used directly with llama.cpp - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption - [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage -- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with prebuild Mobile and Web platform wrappers and a model example) +- [Styled Lines](https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902) (proprietary licensed, async wrapper of inference part for game development in Unity3d with pre-built Mobile and Web platform wrappers and a model example) -**Infrastructure:** +
+ +
+Infrastructure - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly -**Games:** -- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you. - -## Demo +
-Typical run using LLaMA v2 13B on M2 Ultra +Games -``` -$ make -j && ./llama-cli -m models/llama-13b-v2/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -I llama.cpp build info: -I UNAME_S: Darwin -I UNAME_P: arm -I UNAME_M: arm64 -I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_K_QUANTS -DGGML_USE_ACCELERATE -I CXXFLAGS: -I. -I./common -O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -DGGML_USE_K_QUANTS -I LDFLAGS: -framework Accelerate -I CC: Apple clang version 14.0.3 (clang-1403.0.22.14.1) -I CXX: Apple clang version 14.0.3 (clang-1403.0.22.14.1) - -make: Nothing to be done for `default'. -main: build = 1041 (cf658ad) -main: seed = 1692823051 -llama_model_loader: loaded meta data with 16 key-value pairs and 363 tensors from models/llama-13b-v2/ggml-model-q4_0.gguf (version GGUF V1 (latest)) -llama_model_loader: - type f32: 81 tensors -llama_model_loader: - type q4_0: 281 tensors -llama_model_loader: - type q6_K: 1 tensors -llm_load_print_meta: format = GGUF V1 (latest) -llm_load_print_meta: arch = llama -llm_load_print_meta: vocab type = SPM -llm_load_print_meta: n_vocab = 32000 -llm_load_print_meta: n_merges = 0 -llm_load_print_meta: n_ctx_train = 4096 -llm_load_print_meta: n_ctx = 512 -llm_load_print_meta: n_embd = 5120 -llm_load_print_meta: n_head = 40 -llm_load_print_meta: n_head_kv = 40 -llm_load_print_meta: n_layer = 40 -llm_load_print_meta: n_rot = 128 -llm_load_print_meta: n_gqa = 1 -llm_load_print_meta: f_norm_eps = 1.0e-05 -llm_load_print_meta: f_norm_rms_eps = 1.0e-05 -llm_load_print_meta: n_ff = 13824 -llm_load_print_meta: freq_base = 10000.0 -llm_load_print_meta: freq_scale = 1 -llm_load_print_meta: model type = 13B -llm_load_print_meta: model ftype = mostly Q4_0 -llm_load_print_meta: model size = 13.02 B -llm_load_print_meta: general.name = LLaMA v2 -llm_load_print_meta: BOS token = 1 '' -llm_load_print_meta: EOS token = 2 '' -llm_load_print_meta: UNK token = 0 '' -llm_load_print_meta: LF token = 13 '<0x0A>' -llm_load_tensors: ggml ctx size = 0.11 MB -llm_load_tensors: mem required = 7024.01 MB (+ 400.00 MB per state) -................................................................................................... -llama_new_context_with_model: kv self size = 400.00 MB -llama_new_context_with_model: compute buffer total size = 75.41 MB - -system_info: n_threads = 16 / 24 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | -sampling: repeat_last_n = 64, repeat_penalty = 1.100000, presence_penalty = 0.000000, frequency_penalty = 0.000000, top_k = 40, tfs_z = 1.000000, top_p = 0.950000, typical_p = 1.000000, temp = 0.800000, mirostat = 0, mirostat_lr = 0.100000, mirostat_ent = 5.000000 -generate: n_ctx = 512, n_batch = 512, n_predict = 400, n_keep = 0 - - - Building a website can be done in 10 simple steps: -Step 1: Find the right website platform. -Step 2: Choose your domain name and hosting plan. -Step 3: Design your website layout. -Step 4: Write your website content and add images. 
-Step 5: Install security features to protect your site from hackers or spammers -Step 6: Test your website on multiple browsers, mobile devices, operating systems etc… -Step 7: Test it again with people who are not related to you personally – friends or family members will work just fine! -Step 8: Start marketing and promoting the website via social media channels or paid ads -Step 9: Analyze how many visitors have come to your site so far, what type of people visit more often than others (e.g., men vs women) etc… -Step 10: Continue to improve upon all aspects mentioned above by following trends in web design and staying up-to-date on new technologies that can enhance user experience even further! -How does a Website Work? -A website works by having pages, which are made of HTML code. This code tells your computer how to display the content on each page you visit – whether it’s an image or text file (like PDFs). In order for someone else’s browser not only be able but also want those same results when accessing any given URL; some additional steps need taken by way of programming scripts that will add functionality such as making links clickable! -The most common type is called static HTML pages because they remain unchanged over time unless modified manually (either through editing files directly or using an interface such as WordPress). They are usually served up via HTTP protocols – this means anyone can access them without having any special privileges like being part of a group who is allowed into restricted areas online; however, there may still exist some limitations depending upon where one lives geographically speaking. -How to -llama_print_timings: load time = 576.45 ms -llama_print_timings: sample time = 283.10 ms / 400 runs ( 0.71 ms per token, 1412.91 tokens per second) -llama_print_timings: prompt eval time = 599.83 ms / 19 tokens ( 31.57 ms per token, 31.68 tokens per second) -llama_print_timings: eval time = 24513.59 ms / 399 runs ( 61.44 ms per token, 16.28 tokens per second) -llama_print_timings: total time = 25431.49 ms -``` +- [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.
-
-Demo of running both LLaMA-7B and whisper.cpp on a single M1 Pro MacBook +## Supported backends + +| Backend | Target devices | +| --- | --- | +| [Metal](./docs/build.md#metal-build) | Apple Silicon | +| [BLAS](./docs/build.md#blas-build) | All | +| [BLIS](./docs/backend/BLIS.md) | All | +| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | +| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU | +| [CUDA](./docs/build.md#cuda) | Nvidia GPU | +| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | +| [Vulkan](./docs/build.md#vulkan) | GPU | +| [CANN](./docs/build.md#cann) | Ascend NPU | -And here is another demo of running both LLaMA-7B and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) on a single M1 Pro MacBook: +## Building and usage -https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8b4f-add84093ffff.mp4 +The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h). +The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries: -
+- Clone this repository and build locally, see [how to build](./docs/build.md) +- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](./docs/install.md) +- Use a Docker image, see [documentation for Docker](./docs/docker.md) +- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases) + +### Obtaining and quantizing models + +The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`: + +- [Trending](https://huggingface.co/models?library=gguf&sort=trending) +- [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf) + +After downloading a model, use the CLI tools to run it locally - see below. + +`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo. -## Usage +The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`: -Here are the end-to-end binary build and model conversion steps for most supported models. +- Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes +- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123) +- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268) +- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669) -### Basic usage +To learn more about model quantization, [read this documentation](./examples/quantize/README.md) -Firstly, you need to get the binary. There are different methods that you can follow: -- Method 1: Clone this repository and build locally, see [how to build](./docs/build.md) -- Method 2: If you are using MacOS or Linux, you can install llama.cpp via [brew, flox or nix](./docs/install.md) -- Method 3: Use a Docker image, see [documentation for Docker](./docs/docker.md) -- Method 4: Download pre-built binary from [releases](https://github.com/ggerganov/llama.cpp/releases) +### Using the `llama-cli` tool -You can run a basic completion using this command: +Run a basic text completion: ```bash llama-cli -m your_model.gguf -p "I believe the meaning of life is" -n 128 @@ -316,7 +268,7 @@ See [this page](./examples/main/README.md) for a full list of parameters. ### Conversation mode -If you want a more ChatGPT-like experience, you can run in conversation mode by passing `-cnv` as a parameter: +Run `llama-cli` in conversation/chat mode by passing the `-cnv` parameter: ```bash llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv @@ -332,118 +284,43 @@ llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv By default, the chat template will be taken from the input model. If you want to use another chat template, pass `--chat-template NAME` as a parameter. 
See the list of [supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) ```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml +llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml ``` You can also use your own template via in-prefix, in-suffix and reverse-prompt parameters: ```bash -./llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' -``` - -### Web server - -[llama.cpp web server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. - -Example usage: - -```bash -./llama-server -m your_model.gguf --port 8080 - -# Basic web UI can be accessed via browser: http://localhost:8080 -# Chat completion endpoint: http://localhost:8080/v1/chat/completions -``` - -### Interactive mode - -> [!NOTE] -> If you prefer basic usage, please consider using conversation mode instead of interactive mode - -In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. - -Here is an example of a few-shot interaction, invoked with the command - -```bash -# default arguments using a 7B model -./examples/chat.sh - -# advanced chat with a 13B model -./examples/chat-13B.sh - -# custom arguments using a 13B model -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt -``` - -Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `llama-cli` example program. - -![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png) - -### Persistent Interaction - -The prompt, user inputs, and model generations can be saved and resumed across calls to `./llama-cli` by leveraging `--prompt-cache` and `--prompt-cache-all`. The `./examples/chat-persistent.sh` script demonstrates this with support for long-running, resumable chat sessions. To use this example, you must provide a file to cache the initial chat prompt and a directory to save the chat session, and may optionally provide the same variables as `chat-13B.sh`. The same prompt cache can be reused for new chat sessions. Note that both prompt cache and chat directory are tied to the initial prompt (`PROMPT_TEMPLATE`) and the model file. 
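+# Illustrative notes on the flags used below (see examples/main/README.md for the full list):
+#   --in-prefix 'User: '      text prepended to every user turn
+#   --reverse-prompt 'User:'  stop generating and return control to the user when the model emits "User:"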
- -```bash -# Start a new chat -PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh - -# Resume that chat -PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/default ./examples/chat-persistent.sh - -# Start a different chat with the same prompt/model -PROMPT_CACHE_FILE=chat.prompt.bin CHAT_SAVE_DIR=./chat/another ./examples/chat-persistent.sh - -# Different prompt cache for different prompt/model -PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \ - CHAT_SAVE_DIR=./chat/bob ./examples/chat-persistent.sh +llama-cli -m your_model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:' ``` ### Constrained output with grammars -`llama.cpp` supports grammars to constrain model output. For example, you can force the model to output JSON only: +`llama.cpp` can constrain the output of the model via custom grammars. For example, you can force the model to output only JSON: ```bash -./llama-cli -m ./models/13B/ggml-model-q4_0.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' +llama-cli -m your_model.gguf -n 256 --grammar-file grammars/json.gbnf -p 'Request: schedule a call at 8pm; Command:' ``` The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md). -For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one. +For authoring more complex JSON grammars, check out https://grammar.intrinsiclabs.ai/ -## Build +### Web server (`llama-server`) -Please refer to [Build llama.cpp locally](./docs/build.md) +The [llama-server](./examples/server/README.md) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. -## Supported backends - -| Backend | Target devices | -| --- | --- | -| [Metal](./docs/build.md#metal-build) | Apple Silicon | -| [BLAS](./docs/build.md#blas-build) | All | -| [BLIS](./docs/backend/BLIS.md) | All | -| [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU | -| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU | -| [CUDA](./docs/build.md#cuda) | Nvidia GPU | -| [hipBLAS](./docs/build.md#hipblas) | AMD GPU | -| [Vulkan](./docs/build.md#vulkan) | GPU | -| [CANN](./docs/build.md#cann) | Ascend NPU | - -## Tools - -### Prepare and Quantize - -> [!NOTE] -> You can use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any setup too. It is synced from `llama.cpp` main every 6 hours. - -To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. +Example usage: -Note: `convert.py` has been moved to `examples/convert_legacy_llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. -It does not support LLaMA 3, you can use `convert_hf_to_gguf.py` with LLaMA 3 downloaded from Hugging Face. 
+```bash +llama-server -m your_model.gguf --port 8080 -To learn more about quantizing model, [read this documentation](./examples/quantize/README.md) +# Basic web UI can be accessed via browser: http://localhost:8080 +# Chat completion endpoint: http://localhost:8080/v1/chat/completions +``` ### Perplexity (measuring model quality) -You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better). +Use the `llama-perplexity` tool to measure perplexity over a given prompt (lower perplexity is better). For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). To learn more how to measure perplexity using llama.cpp, [read this documentation](./examples/perplexity/README.md) @@ -463,7 +340,6 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio - [main (cli)](./examples/main/README.md) - [server](./examples/server/README.md) -- [jeopardy](./examples/jeopardy/README.md) - [GBNF grammars](./grammars/README.md) **Development documentation** diff --git a/cmake/common.cmake b/cmake/common.cmake new file mode 100644 index 0000000000000..0f54871e4143d --- /dev/null +++ b/cmake/common.cmake @@ -0,0 +1,33 @@ +function(llama_add_compile_flags) + if (LLAMA_FATAL_WARNINGS) + if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + list(APPEND C_FLAGS -Werror) + list(APPEND CXX_FLAGS -Werror) + elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + add_compile_options(/WX) + endif() + endif() + + if (LLAMA_ALL_WARNINGS) + if (NOT MSVC) + list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes + -Werror=implicit-int -Werror=implicit-function-declaration) + + list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn) + + list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function) + + list(APPEND C_FLAGS ${WARNING_FLAGS}) + list(APPEND CXX_FLAGS ${WARNING_FLAGS}) + + ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}) + + add_compile_options("$<$:${C_FLAGS};${GF_C_FLAGS}>" + "$<$:${CXX_FLAGS};${GF_CXX_FLAGS}>") + else() + # todo : msvc + set(C_FLAGS "" PARENT_SCOPE) + set(CXX_FLAGS "" PARENT_SCOPE) + endif() + endif() +endfunction() diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 9181ac62739b9..d675c47593be0 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -12,6 +12,8 @@ set(CMAKE_CXX_FLAGS_RELEASE "-Ofast") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) find_package(Threads REQUIRED) +llama_add_compile_flags() + # Build info header # @@ -96,5 +98,5 @@ if (LLAMA_CURL) endif () target_include_directories(${TARGET} PUBLIC .) 
-target_compile_features (${TARGET} PUBLIC cxx_std_11) +target_compile_features (${TARGET} PUBLIC cxx_std_17) target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) diff --git a/common/arg.cpp b/common/arg.cpp index 32240f21f2469..32d9a964c1716 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -128,7 +128,11 @@ static void common_params_handle_model_default(common_params & params) { } params.hf_file = params.model; } else if (params.model.empty()) { - params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); + // this is to avoid different repo having same file name, or same file name in different subdirs + std::string filename = params.hf_repo + "_" + params.hf_file; + // to make sure we don't have any slashes in the filename + string_replace_all(filename, "/", "_"); + params.model = fs_get_cache_file(filename); } } else if (!params.model_url.empty()) { if (params.model.empty()) { @@ -298,6 +302,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) { print_options(specific_options); } +static std::vector parse_device_list(const std::string & value) { + std::vector devices; + auto dev_names = string_split(value, ','); + if (dev_names.empty()) { + throw std::invalid_argument("no devices specified"); + } + if (dev_names.size() == 1 && dev_names[0] == "none") { + devices.push_back(nullptr); + } else { + for (const auto & device : dev_names) { + auto * dev = ggml_backend_dev_by_name(device.c_str()); + if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) { + throw std::invalid_argument(string_format("invalid device: %s", device.c_str())); + } + devices.push_back(dev); + } + devices.push_back(nullptr); + } + return devices; +} + bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -324,6 +349,9 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e } common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) { + // load dynamic backends + ggml_backend_load_all(); + common_params_context ctx_arg(params); ctx_arg.print_usage = print_usage; ctx_arg.ex = ex; @@ -1312,14 +1340,39 @@ common_params_context common_params_parser_init(common_params & params, llama_ex else { throw std::invalid_argument("invalid value"); } } ).set_env("LLAMA_ARG_NUMA")); + add_opt(common_arg( + {"-dev", "--device"}, "", + "comma-separated list of devices to use for offloading (none = don't offload)\n" + "use --list-devices to see a list of available devices", + [](common_params & params, const std::string & value) { + params.devices = parse_device_list(value); + } + ).set_env("LLAMA_ARG_DEVICE")); + add_opt(common_arg( + {"--list-devices"}, + "print list of available devices and exit", + [](common_params &) { + printf("Available devices:\n"); + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024); + } + } + exit(0); + } + )); add_opt(common_arg( {"-ngl", "--gpu-layers", 
"--n-gpu-layers"}, "N", "number of layers to store in VRAM", [](common_params & params, int value) { params.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); + fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } ).set_env("LLAMA_ARG_N_GPU_LAYERS")); @@ -1336,10 +1389,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { throw std::invalid_argument("invalid value"); @@ -2042,14 +2091,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.speculative.n_ctx = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); + add_opt(common_arg( + {"-devd", "--device-draft"}, "", + "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n" + "use --list-devices to see a list of available devices", + [](common_params & params, const std::string & value) { + params.speculative.devices = parse_device_list(value); + } + ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N", "number of layers to store in VRAM for the draft model", [](common_params & params, int value) { params.speculative.n_gpu_layers = value; if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n"); + fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n"); } } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER})); diff --git a/common/common.cpp b/common/common.cpp index c398329d05bf5..6143516d2250f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -652,7 +652,17 @@ bool fs_validate_filename(const std::string & filename) { std::u32string filename_utf32; try { +#if defined(__clang__) + // disable C++17 deprecation warning for std::codecvt_utf8 +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif std::wstring_convert, char32_t> converter; + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + filename_utf32 = converter.from_bytes(filename); // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used, @@ -829,9 +839,9 @@ struct common_init_result common_init_from_params(common_params & params) { llama_model * model = nullptr; if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = 
common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); + model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams); } else if (!params.model_url.empty()) { - model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams); + model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); } else { model = llama_load_model_from_file(params.model.c_str(), mparams); } @@ -979,9 +989,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector devices; // devices to use for offloading int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding @@ -178,9 +179,6 @@ struct common_params { int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width int32_t n_print = -1; // print token count every n tokens (-1 = disabled) @@ -193,6 +191,13 @@ struct common_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = 0.1f; // KV cache defragmentation threshold + // offload params + std::vector devices; // devices to use for offloading + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -201,7 +206,6 @@ struct common_params { ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; - enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings @@ -462,12 +466,21 @@ struct common_init_result { struct common_init_result common_init_from_params(common_params & params); -struct llama_model_params common_model_params_to_llama (const common_params & params); +struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params); -struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct 
llama_model_params & params); +struct llama_model * common_load_model_from_url( + const std::string & model_url, + const std::string & local_path, + const std::string & hf_token, + const struct llama_model_params & params); +struct llama_model * common_load_model_from_hf( + const std::string & repo, + const std::string & remote_path, + const std::string & local_path, + const std::string & hf_token, + const struct llama_model_params & params); // clear LoRA adapters from context, then apply new list of adapters void common_lora_adapters_apply(struct llama_context * ctx, std::vector & lora_adapters); diff --git a/common/speculative.cpp b/common/speculative.cpp index fe315a2703e9c..e559675c436ef 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -90,9 +90,10 @@ bool common_speculative_are_compatible( if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || llama_token_bos(model_tgt) != llama_token_bos(model_dft) || - llama_token_eos(model_tgt) != llama_token_eos(model_dft) - ) { + llama_token_eos(model_tgt) != llama_token_eos(model_dft)) { LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__); + LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt)); + LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft)); return false; } diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 80a179b86af7e..b931049d11e2d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3040,9 +3040,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo1124ForCausalLM") -class Olmo1124Model(Model): - model_arch = gguf.MODEL_ARCH.OLMO_1124 +@Model.register("Olmo2ForCausalLM") +class Olmo2Model(Model): + model_arch = gguf.MODEL_ARCH.OLMO2 @Model.register("OlmoeForCausalLM") diff --git a/docs/android.md b/docs/android.md index 320b62240382f..47530c6c1d478 100644 --- a/docs/android.md +++ b/docs/android.md @@ -23,10 +23,10 @@ $ curl -L {model-url} -o ~/{model}.gguf Then, if you are not already in the repo directory, `cd` into `llama.cpp` and: ``` -$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" +$ ./build/bin/llama-cli -m ~/{model}.gguf -c {context-size} -p "{your-prompt}" ``` -Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. +Here, we show `llama-cli`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal. To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone: diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md index 6bdd9d2daab90..23f10175a6b2d 100644 --- a/docs/backend/CANN.md +++ b/docs/backend/CANN.md @@ -23,6 +23,8 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi ## News +- 2024.11 + - Support F16 and F32 data type model for Ascend 310P NPU. 
- 2024.8 - Support `Q4_0` and `Q8_0` data type for Ascend NPU. - 2024.7 @@ -40,9 +42,11 @@ The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the abi ### Ascend NPU **Verified devices** + | Ascend NPU | Status | |:-----------------------------:|:-------:| | Atlas 300T A2 | Support | +| Atlas 300I Duo | Support | *Notes:* diff --git a/docs/build.md b/docs/build.md index 359952b30ff93..72b81043796dc 100644 --- a/docs/build.md +++ b/docs/build.md @@ -221,7 +221,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm - Using `make`: ```bash - make GGML_HIPBLAS=1 + make GGML_HIP=1 ``` - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash @@ -249,7 +249,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm - Using `make` (example for target gfx1030, build with 16 CPU threads): ```bash - make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 + make -j16 GGML_HIP=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 ``` - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9957ced091224..0cf323e78bd6b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -16,19 +16,20 @@ find_package(Threads REQUIRED) # ... +# flags + +llama_add_compile_flags() + # examples include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(export-lora) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -37,29 +38,36 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) - add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (GGML_RPC) - add_subdirectory(rpc) - endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) + add_subdirectory(server) endif() add_subdirectory(save-load-state) + add_subdirectory(run) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + add_subdirectory(quantize-stats) + add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) + endif() + endif() endif() diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt index 959acaeeebc38..68ad707f32c98 100644 --- a/examples/batched-bench/CMakeLists.txt +++ b/examples/batched-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched-bench) add_executable(${TARGET} batched-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git 
a/examples/batched/CMakeLists.txt b/examples/batched/CMakeLists.txt index 77e33343b6673..0d439f49842b5 100644 --- a/examples/batched/CMakeLists.txt +++ b/examples/batched/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-batched) add_executable(${TARGET} batched.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/convert-llama2c-to-ggml/CMakeLists.txt b/examples/convert-llama2c-to-ggml/CMakeLists.txt index a6790e617217e..44e5f722a9739 100644 --- a/examples/convert-llama2c-to-ggml/CMakeLists.txt +++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-convert-llama2c-to-ggml) add_executable(${TARGET} convert-llama2c-to-ggml.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/cvector-generator/CMakeLists.txt b/examples/cvector-generator/CMakeLists.txt index 0a559d60c2a6d..49ad9561c82ea 100644 --- a/examples/cvector-generator/CMakeLists.txt +++ b/examples/cvector-generator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cvector-generator) add_executable(${TARGET} cvector-generator.cpp pca.hpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 8256e789ad33a..809040307d2c9 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-embedding) add_executable(${TARGET} embedding.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..95915ed91c099 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -2,8 +2,9 @@ set(TARGET llama-eval-callback) add_executable(${TARGET} eval-callback.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/export-lora/CMakeLists.txt b/examples/export-lora/CMakeLists.txt index 1cef6e71694e2..310455787a7ef 100644 --- a/examples/export-lora/CMakeLists.txt +++ b/examples/export-lora/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-export-lora) add_executable(${TARGET} export-lora.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE 
common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt index 4edd6ec7394c5..d2cb524c0a7f7 100644 --- a/examples/gbnf-validator/CMakeLists.txt +++ b/examples/gbnf-validator/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gbnf-validator) add_executable(${TARGET} gbnf-validator.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gen-docs/CMakeLists.txt b/examples/gen-docs/CMakeLists.txt index c94cda7764341..25de0af35df60 100644 --- a/examples/gen-docs/CMakeLists.txt +++ b/examples/gen-docs/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gen-docs) add_executable(${TARGET} gen-docs.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf-hash/CMakeLists.txt b/examples/gguf-hash/CMakeLists.txt index 633f4553594bb..15c5c68c6f402 100644 --- a/examples/gguf-hash/CMakeLists.txt +++ b/examples/gguf-hash/CMakeLists.txt @@ -4,12 +4,19 @@ install(TARGETS ${TARGET} RUNTIME) # clibs dependencies include_directories(deps/) + add_library(xxhash OBJECT deps/xxhash/xxhash.c deps/xxhash/xxhash.h) target_link_libraries(${TARGET} PRIVATE xxhash) + add_library(sha1 OBJECT deps/sha1/sha1.c deps/sha1/sha1.h) target_link_libraries(${TARGET} PRIVATE sha1) +if (NOT MSVC) + # disable warnings in 3rd party code + target_compile_options(sha1 PRIVATE -w) +endif() + add_library(sha256 OBJECT deps/sha256/sha256.c deps/sha256/sha256.h) target_link_libraries(${TARGET} PRIVATE sha256) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt index f63887da7dfca..c407e2f0af44a 100644 --- a/examples/gguf-split/CMakeLists.txt +++ b/examples/gguf-split/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf-split) add_executable(${TARGET} gguf-split.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gguf/CMakeLists.txt b/examples/gguf/CMakeLists.txt index a9569b411956b..fb04eb83f34ce 100644 --- a/examples/gguf/CMakeLists.txt +++ b/examples/gguf/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gguf) add_executable(${TARGET} gguf.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/gritlm/CMakeLists.txt b/examples/gritlm/CMakeLists.txt index 86dfddca346fe..fa1b4dc70c2f6 100644 --- a/examples/gritlm/CMakeLists.txt +++ b/examples/gritlm/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-gritlm) add_executable(${TARGET} gritlm.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) 
-target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt index d4c8265bdb9d2..412696c47c31c 100644 --- a/examples/imatrix/CMakeLists.txt +++ b/examples/imatrix/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-imatrix) add_executable(${TARGET} imatrix.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 70ff47768c02b..45206f4a7dee1 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -637,10 +637,19 @@ int main(int argc, char ** argv) { LOG_INF("%s\n", common_params_get_system_info(params).c_str()); } - if (!compute_imatrix(ctx, params)) { - return 1; + if (params.prompt.empty()) { + if (params.in_files.empty()) { + LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n"); + return 1; + } + LOG_INF("No prompt provided; combining precomputed matrices only.\n"); + } else { + if (!compute_imatrix(ctx, params)) { + return 1; + } } + g_collector.save_imatrix(); LOG("\n"); diff --git a/examples/infill/CMakeLists.txt b/examples/infill/CMakeLists.txt index 9b1aa3b63c920..fb26628d82bf9 100644 --- a/examples/infill/CMakeLists.txt +++ b/examples/infill/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-infill) add_executable(${TARGET} infill.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llama-bench/CMakeLists.txt b/examples/llama-bench/CMakeLists.txt index 5bdbea4e28187..17e3b9b87bae4 100644 --- a/examples/llama-bench/CMakeLists.txt +++ b/examples/llama-bench/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-bench) add_executable(${TARGET} llama-bench.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3dc84a75cbec7..bac606f471639 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); + // initialize backends + ggml_backend_load_all(); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = 
ggml_threadpool_new_fn(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_free(threadpool); + ggml_threadpool_free_fn(threadpool); } llama_free_model(lmodel); diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index bbf5fec586feb..5d32f377f2637 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -11,7 +11,7 @@ target_include_directories(llava PUBLIC .) target_include_directories(llava PUBLIC ../..) target_include_directories(llava PUBLIC ../../common) -target_compile_features(llava PRIVATE cxx_std_11) +target_compile_features(llava PRIVATE cxx_std_17) add_library(llava_static STATIC $) if (BUILD_SHARED_LIBS) @@ -35,11 +35,11 @@ add_executable(${TARGET} llava-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-minicpmv-cli) add_executable(${TARGET} minicpmv-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index aae49c965e905..7ba4cea58e80b 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -40,10 +40,17 @@ #include #include -#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +#if defined(LLAVA_LOG_OFF) +# define LOG_INF(...) +# define LOG_WRN(...) +# define LOG_ERR(...) +# define LOG_DBG(...) +#else // defined(LLAVA_LOG_OFF) +# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#endif // defined(LLAVA_LOG_OFF) //#define CLIP_DEBUG_FUNCTIONS diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index be69885408433..4ca53a0b883b9 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -11,13 +11,17 @@ #include #include -#define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) -#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) - -#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) -#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0) -#define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#if defined(LLAVA_LOG_OFF) +# define LOG_INF(...) +# define LOG_WRN(...) +# define LOG_ERR(...) +# define LOG_DBG(...) +#else // defined(LLAVA_LOG_OFF) +# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_ERR(...) 
do { fprintf(stderr, __VA_ARGS__); } while (0) +# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0) +#endif // defined(LLAVA_LOG_OFF) // RGB uint8 image struct clip_image_u8 { @@ -498,10 +502,16 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long errno = 0; size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer if (ferror(file)) { - die_fmt("read error: %s", strerror(errno)); + LOG_ERR("read error: %s", strerror(errno)); + free(buffer); + fclose(file); + return false; } if (ret != (size_t) fileSize) { - die("unexpectedly reached end of file"); + LOG_ERR("unexpectedly reached end of file"); + free(buffer); + fclose(file); + return false; } fclose(file); // Close the file diff --git a/examples/lookahead/CMakeLists.txt b/examples/lookahead/CMakeLists.txt index f0ae5cd89244c..3468613142de0 100644 --- a/examples/lookahead/CMakeLists.txt +++ b/examples/lookahead/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-lookahead) add_executable(${TARGET} lookahead.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index ef19fe25e31a3..fba78ceda6fd7 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -2,22 +2,22 @@ set(TARGET llama-lookup) add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-create) add_executable(${TARGET} lookup-create.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-merge) add_executable(${TARGET} lookup-merge.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) set(TARGET llama-lookup-stats) add_executable(${TARGET} lookup-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 3b38db292320f..5563f4de0a504 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -29,4 +29,4 @@ add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp) target_include_directories(${TARGET} PRIVATE ${_common_path}) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index 5f6efaa9aa94b..af3d9150f8640 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-cli) add_executable(${TARGET} main.cpp) install(TARGETS ${TARGET} RUNTIME) 
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 957451af7ce0a..d0c28f317b8c5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -165,6 +165,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -174,7 +178,7 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new(&tpp_batch); + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); if (!threadpool_batch) { LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); return 1; @@ -184,7 +188,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); return 1; @@ -890,8 +894,8 @@ int main(int argc, char ** argv) { llama_backend_free(); - ggml_threadpool_free(threadpool); - ggml_threadpool_free(threadpool_batch); + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } diff --git a/examples/parallel/CMakeLists.txt b/examples/parallel/CMakeLists.txt index c13557bac2bac..847e916de6ed8 100644 --- a/examples/parallel/CMakeLists.txt +++ b/examples/parallel/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-parallel) add_executable(${TARGET} parallel.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/passkey/CMakeLists.txt b/examples/passkey/CMakeLists.txt index dc467a5d3e411..9bc5110c29309 100644 --- a/examples/passkey/CMakeLists.txt +++ b/examples/passkey/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-passkey) add_executable(${TARGET} passkey.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index be0f2fd029e67..3e68640933afb 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-perplexity) add_executable(${TARGET} perplexity.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git 
a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt index bb986a716883d..9a3a0d3cd2dee 100644 --- a/examples/quantize-stats/CMakeLists.txt +++ b/examples/quantize-stats/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} quantize-stats.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 62680cda4455f..47e5cbe30cfe3 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -3,4 +3,4 @@ add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt index 66610f3111405..512a602ec045c 100644 --- a/examples/retrieval/CMakeLists.txt +++ b/examples/retrieval/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-retrieval) add_executable(${TARGET} retrieval.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/CMakeLists.txt b/examples/run/CMakeLists.txt new file mode 100644 index 0000000000000..52add51ef77c3 --- /dev/null +++ b/examples/run/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-run) +add_executable(${TARGET} run.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/run/README.md b/examples/run/README.md new file mode 100644 index 0000000000000..6e926811f3cff --- /dev/null +++ b/examples/run/README.md @@ -0,0 +1,7 @@ +# llama.cpp/example/run + +The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models. + +```bash +./llama-run Meta-Llama-3.1-8B-Instruct.gguf +... 
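For orientation, the flow that the new llama-run example implements (the full run.cpp follows in the next diff) is: load the model, create a context and a sampler chain, tokenize the prompt, then alternate llama_decode / llama_sampler_sample until an end-of-generation token. Below is a minimal sketch of that loop condensed from the generate() logic in run.cpp; the model path and prompt are placeholders, a greedy sampler stands in for the min-p/temperature/dist chain that run.cpp configures, and error handling, the chat template, and stdin handling are left out.

```cpp
#include <cstdio>
#include <string>
#include <vector>

#include "llama.h"

int main() {
    // load the model with all layers offloaded to the GPU (placeholder path)
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    // create a context sized like the llama-run default (-c 2048)
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx   = 2048;
    cparams.n_batch = 2048;
    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // simplest possible sampler chain: greedy sampling
    llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

    // tokenize the prompt; the first call with NULL only measures the token count
    const std::string prompt = "Hello";
    const int n_tokens = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
    std::vector<llama_token> tokens(n_tokens);
    llama_tokenize(model, prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), true, true);

    // decode the prompt, then sample and print one token at a time
    llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
    llama_token tok;
    while (true) {
        if (llama_decode(ctx, batch)) {
            break; // decode failed or the context is full
        }
        tok = llama_sampler_sample(smpl, ctx, -1);
        if (llama_token_is_eog(model, tok)) {
            break;
        }
        char buf[256];
        const int n = llama_token_to_piece(model, tok, buf, sizeof(buf), 0, true);
        if (n < 0) {
            break;
        }
        printf("%.*s", n, buf);
        fflush(stdout);
        batch = llama_batch_get_one(&tok, 1); // next batch is just the sampled token
    }
    printf("\n");

    llama_sampler_free(smpl);
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```

run.cpp wraps the same handles in the llama_model_ptr / llama_context_ptr / llama_sampler_ptr aliases from llama-cpp.h, so cleanup happens automatically instead of through the explicit llama_free* calls used in this sketch.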
diff --git a/examples/run/run.cpp b/examples/run/run.cpp new file mode 100644 index 0000000000000..cac2faefcc256 --- /dev/null +++ b/examples/run/run.cpp @@ -0,0 +1,409 @@ +#if defined(_WIN32) +#include <windows.h> +#else +#include <unistd.h> +#endif + +#include <climits> +#include <cstdio> +#include <cstring> +#include <iostream> +#include <memory> +#include <sstream> +#include <string> +#include <unordered_map> +#include <vector> + +#include "llama-cpp.h" + +typedef std::unique_ptr<char[]> char_array_ptr; + +struct Argument { + std::string flag; + std::string help_text; +}; + +struct Options { + std::string model_path, prompt_non_interactive; + int ngl = 99; + int n_ctx = 2048; +}; + +class ArgumentParser { + public: + ArgumentParser(const char * program_name) : program_name(program_name) {} + + void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") { + string_args[flag] = &var; + arguments.push_back({flag, help_text}); + } + + void add_argument(const std::string & flag, int & var, const std::string & help_text = "") { + int_args[flag] = &var; + arguments.push_back({flag, help_text}); + } + + int parse(int argc, const char ** argv) { + for (int i = 1; i < argc; ++i) { + std::string arg = argv[i]; + if (string_args.count(arg)) { + if (i + 1 < argc) { + *string_args[arg] = argv[++i]; + } else { + fprintf(stderr, "error: missing value for %s\n", arg.c_str()); + print_usage(); + return 1; + } + } else if (int_args.count(arg)) { + if (i + 1 < argc) { + if (parse_int_arg(argv[++i], *int_args[arg]) != 0) { + fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]); + print_usage(); + return 1; + } + } else { + fprintf(stderr, "error: missing value for %s\n", arg.c_str()); + print_usage(); + return 1; + } + } else { + fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str()); + print_usage(); + return 1; + } + } + + if (string_args["-m"]->empty()) { + fprintf(stderr, "error: -m is required\n"); + print_usage(); + return 1; + } + + return 0; + } + + private: + const char * program_name; + std::unordered_map<std::string, std::string *> string_args; + std::unordered_map<std::string, int *> int_args; + std::vector<Argument> arguments; + + int parse_int_arg(const char * arg, int & value) { + char * end; + const long val = std::strtol(arg, &end, 10); + if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) { + value = static_cast<int>(val); + return 0; + } + return 1; + } + + void print_usage() const { + printf("\nUsage:\n"); + printf(" %s [OPTIONS]\n\n", program_name); + printf("Options:\n"); + for (const auto & arg : arguments) { + printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str()); + } + + printf("\n"); + } +}; + +class LlamaData { + public: + llama_model_ptr model; + llama_sampler_ptr sampler; + llama_context_ptr context; + std::vector<llama_chat_message> messages; + + int init(const Options & opt) { + model = initialize_model(opt.model_path, opt.ngl); + if (!model) { + return 1; + } + + context = initialize_context(model, opt.n_ctx); + if (!context) { + return 1; + } + + sampler = initialize_sampler(); + return 0; + } + + private: + // Initializes the model and returns a unique pointer to it + llama_model_ptr initialize_model(const std::string & model_path, const int ngl) { + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = ngl; + + llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params)); + if (!model) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + } + + return model; + } + + // Initializes the context with the specified parameters + llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { +
llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = n_ctx; + ctx_params.n_batch = n_ctx; + + llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); + if (!context) { + fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__); + } + + return context; + } + + // Initializes and configures the sampler + llama_sampler_ptr initialize_sampler() { + llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + + return sampler; + } +}; + +// Add a message to `messages` and store its content in `owned_content` +static void add_message(const char * role, const std::string & text, LlamaData & llama_data, + std::vector<char_array_ptr> & owned_content) { + char_array_ptr content(new char[text.size() + 1]); + std::strcpy(content.get(), text.c_str()); + llama_data.messages.push_back({role, content.get()}); + owned_content.push_back(std::move(content)); +} + +// Function to apply the chat template and resize `formatted` if needed +static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) { + int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + llama_data.messages.size(), append, formatted.data(), formatted.size()); + if (result > static_cast<int>(formatted.size())) { + formatted.resize(result); + result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(), + llama_data.messages.size(), append, formatted.data(), formatted.size()); + } + + return result; +} + +// Function to tokenize the prompt +static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt, + std::vector<llama_token> & prompt_tokens) { + const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true); + prompt_tokens.resize(n_prompt_tokens); + if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, + true) < 0) { + GGML_ABORT("failed to tokenize the prompt\n"); + } + + return n_prompt_tokens; +} + +// Check if we have enough space in the context to evaluate this batch +static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) { + const int n_ctx = llama_n_ctx(ctx.get()); + const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get()); + if (n_ctx_used + batch.n_tokens > n_ctx) { + printf("\033[0m\n"); + fprintf(stderr, "context size exceeded\n"); + return 1; + } + + return 0; +} + +// convert the token to a string +static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) { + char buf[256]; + int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true); + if (n < 0) { + GGML_ABORT("failed to convert token to piece\n"); + } + + piece = std::string(buf, n); + return 0; +} + +static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) { + printf("%s", piece.c_str()); + fflush(stdout); + response += piece; +} + +// helper function to evaluate a prompt and generate a response +static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) { + std::vector<llama_token> prompt_tokens; + const int n_prompt_tokens =
tokenize_prompt(llama_data.model, prompt, prompt_tokens); + if (n_prompt_tokens < 0) { + return 1; + } + + // prepare a batch for the prompt + llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size()); + llama_token new_token_id; + while (true) { + check_context_size(llama_data.context, batch); + if (llama_decode(llama_data.context.get(), batch)) { + GGML_ABORT("failed to decode\n"); + } + + // sample the next token and check whether it is an end-of-generation token + new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1); + if (llama_token_is_eog(llama_data.model.get(), new_token_id)) { + break; + } + + std::string piece; + if (convert_token_to_string(llama_data.model, new_token_id, piece)) { + return 1; + } + + print_word_and_concatenate_to_response(piece, response); + + // prepare the next batch with the sampled token + batch = llama_batch_get_one(&new_token_id, 1); + } + + return 0; +} + +static int parse_arguments(const int argc, const char ** argv, Options & opt) { + ArgumentParser parser(argv[0]); + parser.add_argument("-m", opt.model_path, "model"); + parser.add_argument("-p", opt.prompt_non_interactive, "prompt"); + parser.add_argument("-c", opt.n_ctx, "context_size"); + parser.add_argument("-ngl", opt.ngl, "n_gpu_layers"); + if (parser.parse(argc, argv)) { + return 1; + } + + return 0; +} + +static int read_user_input(std::string & user) { + std::getline(std::cin, user); + return user.empty(); // Indicate an error or empty input +} + +// Function to generate a response based on the prompt +static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) { + // Set response color + printf("\033[33m"); + if (generate(llama_data, prompt, response)) { + fprintf(stderr, "failed to generate response\n"); + return 1; + } + + // End response with color reset and newline + printf("\n\033[0m"); + return 0; +} + +// Helper function to apply the chat template and handle errors +static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted, + const bool is_user_input, int & output_length) { + const int new_len = apply_chat_template(llama_data, formatted, is_user_input); + if (new_len < 0) { + fprintf(stderr, "failed to apply the chat template\n"); + return -1; + } + + output_length = new_len; + return 0; +} + +// Helper function to handle user input +static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) { + if (!prompt_non_interactive.empty()) { + user_input = prompt_non_interactive; + return true; // No need for interactive input + } + + printf("\033[32m> \033[0m"); + return !read_user_input(user_input); // Returns false if input ends the loop +} + +// Main chat loop: read user input, apply the chat template and generate a response +static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) { + std::vector<char_array_ptr> owned_content; + std::vector<char> fmtted(llama_n_ctx(llama_data.context.get())); + int prev_len = 0; + + while (true) { + // Get user input + std::string user_input; + if (!handle_user_input(user_input, prompt_non_interactive)) { + break; + } + + add_message("user", prompt_non_interactive.empty() ?
user_input : prompt_non_interactive, llama_data, + owned_content); + + int new_len; + if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) { + return 1; + } + + std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len); + std::string response; + if (generate_response(llama_data, prompt, response)) { + return 1; + } + } + return 0; +} + +static void log_callback(const enum ggml_log_level level, const char * text, void *) { + if (level == GGML_LOG_LEVEL_ERROR) { + fprintf(stderr, "%s", text); + } +} + +static bool is_stdin_a_terminal() { +#if defined(_WIN32) + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdin, &mode); +#else + return isatty(STDIN_FILENO); +#endif +} + +static std::string read_pipe_data() { + std::ostringstream result; + result << std::cin.rdbuf(); // Read all data from std::cin + return result.str(); +} + +int main(int argc, const char ** argv) { + Options opt; + if (parse_arguments(argc, argv, opt)) { + return 1; + } + + if (!is_stdin_a_terminal()) { + if (!opt.prompt_non_interactive.empty()) { + opt.prompt_non_interactive += "\n\n"; + } + + opt.prompt_non_interactive += read_pipe_data(); + } + + llama_log_set(log_callback, nullptr); + LlamaData llama_data; + if (llama_data.init(opt)) { + return 1; + } + + if (chat_loop(llama_data, opt.prompt_non_interactive)) { + return 1; + } + + return 0; +} diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index 0fb5e359bc9ad..0f50e50deecd7 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET llama-save-load-state) add_executable(${TARGET} save-load-state.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 93e876f5a5524..e82f9153303b3 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -50,4 +50,4 @@ if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) endif() -target_compile_features(${TARGET} PRIVATE cxx_std_11) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/server/README.md b/examples/server/README.md index 0936e0b7ba13a..877768c8b0bd2 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -412,7 +412,7 @@ node index.js `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` + `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. 
Default: `true` `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. diff --git a/examples/server/public/index.html b/examples/server/public/index.html index 6216c08410a28..c54260867d2c8 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -81,7 +81,13 @@
