Commit 3e318e7: Merge branch 'master' into gg/flash-attn

2 parents: 57c03b7 + 5106ef4

File tree

21 files changed: +1314 -456 lines

.devops/nix/package.nix

Lines changed: 8 additions & 8 deletions

```diff
@@ -24,7 +24,7 @@
     useOpenCL
     useRocm
     useVulkan
-  ],
+  ] && blas.meta.available,
   useCuda ? config.cudaSupport,
   useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
   useMpi ? false, # Increases the runtime closure size by ~700M
@@ -67,10 +67,15 @@ let
   strings.optionalString (suffices != [ ])
     ", accelerated with ${strings.concatStringsSep ", " suffices}";

+  executableSuffix = effectiveStdenv.hostPlatform.extensions.executable;
+
   # TODO: package the Python in this repository in a Nix-like way.
   # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
   # is PEP 517-compatible, and ensure the correct .dist-info is generated.
   # https://peps.python.org/pep-0517/
+  #
+  # TODO: Package up each Python script or service appropriately, by making
+  # them into "entrypoints"
   llama-python = python3.withPackages (
     ps: [
       ps.numpy
@@ -159,11 +164,6 @@ effectiveStdenv.mkDerivation (
       --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
     substituteInPlace ./ggml-metal.m \
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
-
-    # TODO: Package up each Python script or service appropriately.
-    # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-    # we could make those *.py into setuptools' entrypoints
-    substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
   '';

   # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
@@ -244,8 +244,8 @@ effectiveStdenv.mkDerivation (
   # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
   # if they haven't been added yet.
   postInstall = ''
-    mv $out/bin/main $out/bin/llama
-    mv $out/bin/server $out/bin/llama-server
+    mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix}
+    mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix}
     mkdir -p $out/include
     cp $src/llama.h $out/include/
   '';
```
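For context on the `executableSuffix` change: `hostPlatform.extensions.executable` is the platform's executable suffix, empty on Linux and macOS and `.exe` when targeting Windows, so the renames in `postInstall` now also work for cross builds. A quick way to inspect the value (a sketch, assuming a Nix with the `nix-command` experimental feature enabled):

```bash
# Prints the executable suffix for the current platform:
# "" on Linux/macOS, ".exe" for Windows (mingw) targets.
nix eval --impure --expr \
  '(import <nixpkgs> { }).stdenv.hostPlatform.extensions.executable'
```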

.github/workflows/bench.yml

Lines changed: 280 additions & 0 deletions

This file is entirely new (`@@ -0,0 +1,280 @@`). Triggers, job setup, build, and dataset download:

```yaml
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  pull_request:
    types: [opened, synchronize, reopened]
    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m
    if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Install k6
        id: k6_installation
        run: |
          cd examples/server/bench
          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
          tar xzf k6*.tar.gz --strip-components=1

      - name: Build
        id: cmake_build
        run: |
          set -eux
          mkdir build
          cd build
          cmake .. \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
            -DLLAMA_CURL=ON \
            -DLLAMA_CUBLAS=ON \
            -DCUDAToolkit_ROOT=/usr/local/cuda \
            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
            -DCMAKE_CUDA_ARCHITECTURES=75 \
            -DLLAMA_FATAL_WARNINGS=OFF \
            -DLLAMA_ALL_WARNINGS=OFF \
            -DCMAKE_BUILD_TYPE=Release;
          cmake --build . --config Release -j $(nproc) --target server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
```
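To reproduce the CI build locally, the cmake invocation above reduces to roughly the following (a sketch; it assumes a CUDA toolkit and libcurl dev headers are installed, and uses `cmake -B build` as the equivalent of the workflow's `mkdir build && cd build && cmake ..`):

```bash
# Configure and build the server target with CUDA (cuBLAS), as the workflow does.
# CMAKE_CUDA_ARCHITECTURES=75 matches the T4 (Turing) runner.
cmake -B build \
  -DLLAMA_NATIVE=OFF \
  -DLLAMA_BUILD_SERVER=ON \
  -DLLAMA_CURL=ON \
  -DLLAMA_CUBLAS=ON \
  -DCMAKE_CUDA_ARCHITECTURES=75 \
  -DCMAKE_BUILD_TYPE=Release
cmake --build build --config Release -j "$(nproc)" --target server
```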
The workflow continues with the bench run and result uploads:

```yaml
      - name: Server bench
        id: server_bench
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          BENCH_K6_BIN_PATH=./k6 python bench.py \
            --runner-label ${{ env.RUNNER_LABEL }} \
            --name ${{ github.job }} \
            --branch ${{ github.head_ref || github.ref_name }} \
            --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
            --scenario script.js \
            --duration ${{ github.event.inputs.duration || env.DURATION }} \
            --hf-repo ggml-org/models \
            --hf-file phi-2/ggml-model-q4_0.gguf \
            --model-path-prefix /models \
            --parallel ${{ env.N_USERS }} \
            -ngl 33 \
            --batch-size 2048 \
            --ubatch-size 256 \
            --ctx-size 16384 \
            --n-prompts 1000 \
            --max-prompt-tokens 1024 \
            --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        continue-on-error: true # If not authorized on external repo
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-baseline
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/[email protected]
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg
```
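`bench.py` emits `results.github.env`, and appending it to `$GITHUB_ENV` is what makes values such as `env.BENCH_ITERATIONS` and the `HTTP_REQ_*`/`LLAMACPP_*` metrics available to the later steps. A minimal sketch of the mechanism (assuming, as the workflow does, that the file holds one `KEY=VALUE` pair per line):

```bash
# Each KEY=VALUE line appended to the $GITHUB_ENV file becomes an
# environment variable (env.KEY) in all subsequent steps of the job.
echo "BENCH_ITERATIONS=1234" >> "$GITHUB_ENV"
cat results.github.env >> "$GITHUB_ENV"   # bulk version used by the workflow
```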
The remaining steps extract the mermaid charts and image URLs, then post the PR comment:

````yaml
      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' }}
        with:
          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
          message: |
            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
            - ${{ env.BENCH_GRAPH_XLABEL }}

            <details>

            <summary>Time series</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
````
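Two details worth noting in the steps above: the `KEY<<EOF` writes use GitHub's multiline-value syntax for `$GITHUB_ENV`, and `fromJSON(steps.imgur_step.outputs.imgur_urls)[N]` relies on the imgur action returning its URLs as a JSON array string. A plain-shell analogue of that indexing (a sketch; requires `jq`, and the example URLs are placeholders):

```bash
# imgur_urls is assumed to be a JSON array string of uploaded image URLs.
urls='["https://i.imgur.com/aaa.jpg","https://i.imgur.com/bbb.jpg"]'
# Equivalent of fromJSON(urls)[0] in the workflow expression language:
echo "IMAGE_0=$(echo "$urls" | jq -r '.[0]')" >> "$GITHUB_ENV"
```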

Makefile

Lines changed: 1 addition & 0 deletions

```diff
@@ -556,6 +556,7 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
 endif # LLAMA_CUDA_NO_PEER_COPY
 OBJS += ggml-cuda.o
 OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
```
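This hunk sits in the HIP/ROCm branch of the Makefile: `$(HIPCC) -x hip` compiles the CUDA sources as HIP, and the `patsubst` line maps every `ggml-cuda/*.cu` kernel file to an object target. A minimal sketch of exercising this path (an assumption: the ROCm toolchain is installed and `LLAMA_HIPBLAS=1` selects the HIP build, as in this era's Makefile):

```bash
# Build with the HIP/ROCm toolchain; hipcc compiles ggml-cuda.cu and
# the ggml-cuda/*.cu kernels via the rules shown above.
make LLAMA_HIPBLAS=1 -j "$(nproc)"
```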
