
Commit ecbcf6e

Add -fa option (#14)

* Add -fa option
* Limit the accepted value for -fa to 0 or 1

Co-authored-by: davidz-ampere <>

1 parent aa0a5d7 · commit ecbcf6e

File tree: 4 files changed, +30 −15 lines

benchmarks/README.md
Lines changed: 2 additions & 1 deletion

@@ -27,8 +27,9 @@ Provide run.py Python script with following arguments:
 - -b, batch size(s) to benchmark, meaning separate token generation streams handled as a single batch; multiple batch sizes can be provided and they will be treated as separate cases to benchmark
 - -p, prompt size(s) to benchmark, size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
 - -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless user wants to use just a subset of available threads, say 16-63 (48 threads indexed 16<>63)
+- -fa, 0/1, disable/enable flash attention, default: 0
 ```bash
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79 -fa 1
 ```
 
 ## Quick run on 80t OCI A1 system

benchmarks/run.py
Lines changed: 8 additions & 1 deletion

@@ -12,7 +12,7 @@ def get_file_dir():
 
 
 def docker_init(node):
-    tag = "amperecomputingai/llama.cpp:2.2.1"
+    tag = "amperecomputingai/llama.cpp:3.1.2"
     if subprocess.run(
             ["docker", "pull", tag]).returncode != 0:
         print("Docker pull process failed!")
@@ -66,6 +66,10 @@ def benchmark(docker_container_name, args):
 
     cmd = (f"cd /runner; python3 utils/benchmark.py -m models/{model} -n {str(num_processes)} "
           f"-t {str(num_threads)} -b {str(batch_size)} -p {str(prompt_size)} -r {args.threads_range}")
+
+    if args.fa != 0:
+        cmd += " -fa 1"
+
     cmd = ["docker", "exec", "-i", docker_container_name, "bash", "-c", cmd]
 
     print(f"Executing: {' '.join(cmd)}")
@@ -109,6 +113,9 @@ def parse_args():
     parser.add_argument("-n", "--numa",
                         type=int, default=0,
                         help="numa mode of the docker container")
+    parser.add_argument("-fa",
+                        type=int, default=0, choices=range(0, 2),
+                        help="enable flash attention")
 
     return parser.parse_args()
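The `choices=range(0, 2)` constraint is what enforces the "limit the accepted value for -fa to 0 or 1" part of the commit message: argparse converts the argument with `type=int` and then rejects anything outside {0, 1}. A minimal standalone sketch of that behavior, mirroring the option definition above:

```python
import argparse

# Mirrors the -fa option added in this commit: type=int parses the
# string, then choices=range(0, 2) accepts only the values 0 and 1.
parser = argparse.ArgumentParser()
parser.add_argument("-fa",
                    type=int, default=0, choices=range(0, 2),
                    help="enable flash attention")

print(parser.parse_args(["-fa", "1"]).fa)  # -> 1
print(parser.parse_args([]).fa)            # -> 0 (the default)
# parser.parse_args(["-fa", "2"]) exits with "invalid choice: 2".
```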

benchmarks/run.sh
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 set -e
 
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79 -fa 1
 rm -f /tmp/log_power

benchmarks/utils/benchmark.py
Lines changed: 19 additions & 12 deletions

@@ -6,8 +6,6 @@
 import argparse
 import subprocess
 
-TOKENS = 256
-
 online_threads = None
 
 
@@ -22,6 +20,9 @@ def parse_args():
     parser.add_argument("-p", "--prompt_size",
                         type=int, required=True,
                         help="prompt size to feed the model with")
+    parser.add_argument("-tg", "--tg_size",
+                        type=int, default=256,
+                        help="output tokens generated from the model")
     parser.add_argument("-r", "--threads_range",
                         type=str, required=True,
                         help="range of threads to use, e.g. '0-63,128-191', threads will be divided between processes "
@@ -38,6 +39,9 @@ def parse_args():
     parser.add_argument("--mp",
                         type=str, default="local",
                         help="memory placement policy, 'local', 'interleave' or 'none'")
+    parser.add_argument("-fa",
+                        type=int, default=0, choices=range(0, 2),
+                        help="enable flash attention")
     return parser.parse_args()
 
 
@@ -71,20 +75,20 @@ def summarize_results(logs_dir, args, start, finish):
         prompt_size = int(results[1])
         assert prompt_size == args.prompt_size
         tokens_generated = int(results[2])
-        assert tokens_generated == TOKENS
+        assert tokens_generated == args.tg_size
         batch_size = int(results[3])
         assert batch_size == args.batch_size
         ttfts.append(float(results[5]))
         tg_lats.append(float(results[7]))
 
     pp_throughput = sum([args.batch_size * args.prompt_size / ttft for ttft in ttfts])
     avg_pp_latency = sum(ttfts) / len(ttfts)
-    tg_throughput = sum([args.batch_size * TOKENS / lat for lat in tg_lats])
-    tg_per_token_lats = [lat / TOKENS for lat in tg_lats]
+    tg_throughput = sum([args.batch_size * args.tg_size / lat for lat in tg_lats])
+    tg_per_token_lats = [lat / args.tg_size for lat in tg_lats]
     avg_tg_latency = sum(tg_per_token_lats) / len(tg_per_token_lats)
-    avg_total_speed = args.num_processes * args.batch_size * (args.prompt_size + TOKENS) / max([ttft + tg_lat for ttft, tg_lat in zip(ttfts, tg_lats)])
+    avg_total_speed = args.num_processes * args.batch_size * (args.prompt_size + args.tg_size) / max([ttft + tg_lat for ttft, tg_lat in zip(ttfts, tg_lats)])
 
-    results_filename = f"{args.model.split('/')[-1]}@PP{str(args.prompt_size)}@TG{str(TOKENS)}.csv"
+    results_filename = f"{args.model.split('/')[-1]}@PP{str(args.prompt_size)}@TG{str(args.tg_size)}@fa{str(args.fa)}@ctx{str(args.kv_cache)}.csv"
     if os.path.exists(results_filename):
         first_write = False
     else:
@@ -96,7 +100,7 @@ def summarize_results(logs_dir, args, start, finish):
                 ["n_proc", "n_threads", "batch_size", "prompt_size", "output_tokens", "pp_throughput_tps",
                  "pp_avg_latency_sec", "tg_throughput_tps", "tg_avg_latency_sec", "pp+tg_throughput_tps", "concurrency", "start", "finish"])
         writer.writerow(
-            [args.num_processes, args.num_threads, args.batch_size, args.prompt_size, TOKENS, f"{pp_throughput:.3f}",
+            [args.num_processes, args.num_threads, args.batch_size, args.prompt_size, args.tg_size, f"{pp_throughput:.3f}",
             f"{avg_pp_latency:.3f}", f"{tg_throughput:.3f}", f"{avg_tg_latency:.3f}", f"{avg_total_speed:.3f}", args.batch_size * args.num_processes, f"{start:.3f}", f"{finish:.3f}"])
 
     print(f"Result saved in {results_filename}")
@@ -131,23 +135,26 @@ def main():
         # command-line for v1
         if mem_place == "none":
             cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
-                   "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
+                   "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(args.tg_size),
                    str(args.batch_size), str(args.num_threads)]
         else:
             cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}", str(mem_place),
-                   "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(TOKENS),
+                   "/llm/batched-bench", args.model, str(args.kv_cache), "2048", "512", "0", "0", "0", str(args.prompt_size), str(args.tg_size),
                    str(args.batch_size), str(args.num_threads)]
     elif os.path.exists("/llm/llama-batched-bench"):
         # command-line for v2
         if mem_place == "none":
             cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}",
-                   "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
+                   "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(args.tg_size),
                    "-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "--no-mmap"]
         else:
             cmd = ["numactl", f"--physcpubind={gen_threads_config(args.num_threads, n)}", str(mem_place),
-                   "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(TOKENS),
+                   "/llm/llama-batched-bench", "-m", args.model, "-c", str(args.kv_cache), "-b", "2048", "-ub", "512", "-npp", str(args.prompt_size), "-ntg", str(args.tg_size),
                    "-npl", str(args.batch_size), "-t", str(args.num_threads), "-tb", str(args.num_threads), "--no-mmap"]
 
+        if args.fa != 0:
+            cmd.append("--flash-attn")
+
     else:
         print("FAIL: batched-bench not found!")
         sys.exit(1)
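Note that the new `cmd.append("--flash-attn")` sits inside the `elif` branch, so only the v2 `llama-batched-bench` invocation gains the flag; the v1 `batched-bench` path is unchanged. A sketch of how the final v2 argument list comes together (the numactl binding and numeric values are illustrative stand-ins):

```python
# Stand-in values for the thread binding and sizes used above.
fa = 1
cmd = ["numactl", "--physcpubind=0-9",
       "/llm/llama-batched-bench", "-m", "models/Meta-Llama-3-8B-Instruct.Q8_0.gguf",
       "-c", "2048", "-b", "2048", "-ub", "512", "-npp", "512", "-ntg", "256",
       "-npl", "1", "-t", "10", "-tb", "10", "--no-mmap"]

if fa != 0:
    cmd.append("--flash-attn")  # added as its own list element for subprocess

print(" ".join(cmd))
```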
