+
+20250226151422, tok/s=133.29, tok/s_decode=134.40, ttft=0.0118, mem/s=2000.68 GB/s, peak_mem=16.30 GB, model_size=15.01 GB quant: None, sparse: None, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226151926, tok/s=242.08, tok/s_decode=256.68, ttft=0.0464, mem/s=1182.14 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226152416, tok/s=252.18, tok/s_decode=267.48, ttft=0.0448, mem/s=1229.49 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226153215, tok/s=204.19, tok/s_decode=213.86, ttft=0.0438, mem/s=1226.65 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226153628, tok/s=180.14, tok/s_decode=187.54, ttft=0.0433, mem/s=1081.56 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226160622, tok/s=246.20, tok/s_decode=255.21, ttft=0.0281, mem/s= 956.89 GB/s, peak_mem= 5.56 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226160651, tok/s=145.07, tok/s_decode=163.13, ttft=0.1522, mem/s=1461.87 GB/s, peak_mem=22.76 GB, model_size=10.08 GB quant: None, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+
+20250226161533, tok/s=250.71, tok/s_decode=254.78, ttft=0.0121, mem/s= 974.38 GB/s, peak_mem= 5.56 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226161913, tok/s=251.19, tok/s_decode=254.95, ttft=0.0112, mem/s= 976.26 GB/s, peak_mem= 5.63 GB, model_size= 3.89 GB quant: sparse-marlin, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.float16, device: cuda repro: python generate.py --quantization sparse-marlin --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226181326, tok/s=134.44, tok/s_decode=140.82, ttft=0.0669, mem/s= 807.62 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226181520, tok/s=138.03, tok/s_decode=164.08, ttft=0.2295, mem/s=1390.97 GB/s, peak_mem=22.74 GB, model_size=10.08 GB quant: None, sparse: semi-structured, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.float16, device: cuda repro: python generate.py --sparsity semi-structured --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.float16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226181738, tok/s=192.65, tok/s_decode=205.62, ttft=0.0649, mem/s=1157.32 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226182045, tok/s=192.75, tok/s_decode=206.24, ttft=0.0673, mem/s=1157.27 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226182350, tok/s=236.36, tok/s_decode=257.62, ttft=0.0693, mem/s=1154.19 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226182712, tok/s=231.24, tok/s_decode=250.55, ttft=0.0661, mem/s=1127.37 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226183255, tok/s=169.58, tok/s_decode=179.82, ttft=0.0665, mem/s=1018.74 GB/s, peak_mem= 8.27 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226183527, tok/s=184.74, tok/s_decode=196.38, ttft=0.0637, mem/s=1109.18 GB/s, peak_mem= 8.26 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226183734, tok/s=232.60, tok/s_decode=252.51, ttft=0.0673, mem/s=1135.85 GB/s, peak_mem= 6.74 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250226183953, tok/s=232.47, tok/s_decode=251.15, ttft=0.0635, mem/s=1133.40 GB/s, peak_mem= 6.73 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: False, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250227084325, tok/s=200.72, tok/s_decode=210.91, ttft=0.0475, mem/s=1205.82 GB/s, peak_mem= 8.00 GB, model_size= 6.01 GB quant: None, sparse: bsr-0.8-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250227084708, tok/s=211.76, tok/s_decode=222.43, ttft=0.0447, mem/s=1271.42 GB/s, peak_mem= 7.99 GB, model_size= 6.00 GB quant: None, sparse: bsr-0.8-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.8-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250227085051, tok/s=241.09, tok/s_decode=255.19, ttft=0.0452, mem/s=1177.31 GB/s, peak_mem= 6.47 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-32, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-32 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250227085446, tok/s=247.53, tok/s_decode=262.94, ttft=0.0468, mem/s=1206.80 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250227090411, tok/s=250.11, tok/s_decode=263.99, ttft=0.0416, mem/s=1219.39 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
+20250227091144, tok/s=249.14, tok/s_decode=263.74, ttft=0.0439, mem/s=1214.68 GB/s, peak_mem= 6.46 GB, model_size= 4.88 GB quant: None, sparse: bsr-0.9-64, mod: Meta-Llama-3.1-8B, kv_quant: False, compile: True, compile_prefill: True, dtype: torch.bfloat16, device: cuda repro: python generate.py --sparsity bsr-0.9-64 --checkpoint_path ../../../checkpoints/meta-llama/Meta-Llama-3.1-8B/model.pth --device cuda --precision torch.bfloat16 --compile --compile_prefill --num_samples 5 --max_new_tokens 200 --batch_size 1 --top_k 200 --temperature 0.8
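
The rows above follow the gpt-fast log convention: tok/s is overall throughput, tok/s_decode excludes prefill, ttft is time to first token in seconds, and a sparse tag of bsr-<fraction>-<blocksize> denotes block-sparse (BSR) weights at that sparsity fraction and block size. The mem/s column appears to be derived as overall tok/s times model_size, since each generated token streams the full weight set once. Below is a minimal sketch checking that relationship against a few rows above; the helper name is hypothetical, not part of generate.py.

def estimate_mem_bw(tok_s: float, model_size_gb: float) -> float:
    # Assumption: each decoded token reads every weight once, so achieved
    # bandwidth is roughly throughput times the resident model size.
    return tok_s * model_size_gb

# (tok/s, model_size in GB, reported mem/s in GB/s), taken from rows above.
for tok_s, size_gb, reported in [
    (133.29, 15.01, 2000.68),  # dense bf16 baseline
    (252.18,  4.88, 1229.49),  # bsr-0.9-64
    (246.20,  3.89,  956.89),  # sparse-marlin + semi-structured
]:
    print(f"estimated {estimate_mem_bw(tok_s, size_gb):7.2f} GB/s vs reported {reported:7.2f} GB/s")

Read this way, the sparse rows achieve lower absolute GB/s than the dense baseline yet nearly 2x its tok/s, because the compressed weights are 2-3x smaller than the 15.01 GB dense checkpoint.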