From 11612419d24ed30ab91d1c9db512a9d497ec607d Mon Sep 17 00:00:00 2001 From: Xiaotong Jiang Date: Thu, 4 Sep 2025 07:32:30 +0000 Subject: [PATCH 1/5] GITBOOK-19: No subject --- .../model-recipes/gpt-oss/README.md | 2 +- .../model-recipes/gpt-oss/usage-guide.md | 31 +++++++++++++++++-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/sglang-cookbook/model-recipes/gpt-oss/README.md b/sglang-cookbook/model-recipes/gpt-oss/README.md index 6ebeb01..2fe1e5f 100644 --- a/sglang-cookbook/model-recipes/gpt-oss/README.md +++ b/sglang-cookbook/model-recipes/gpt-oss/README.md @@ -2,7 +2,7 @@ gpt-oss-20b -
-| Weight Type | Hardware Configuration | Instruction | Benchmark |
-| --- | --- | --- | --- |
-| MXFP4 (recommended) | 1 x H100/H200 | #serving-with-1-x-h100-h200 |  |
-|  | 1 x B200 | #serving-with-1-x-b200 |  |
-|  | 1 x MI300X |  |  |
-| Full precision FP8/BF16 | 1 x H200 |  |  |
+| Weight Type | Hardware Configuration | Instruction | Benchmark |
+| --- | --- | --- | --- |
+| MXFP4 (recommended) | 1 x H100/H200 | #serving-with-1-x-h100-h200 | #benchmark |
+|  | 1 x B200 | #serving-with-1-x-b200 |  |
+|  | 1 x MI300X |  |  |
+| Full precision FP8/BF16 | 1 x H200 |  |  |
gpt-oss-120b diff --git a/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md b/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md index 7511784..bc13f92 100644 --- a/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md +++ b/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md @@ -2,8 +2,15 @@ ### Serving with 1 x H100/H200 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) +{% endstep %} + +{% step %} +### Serve the model {% code overflow="wrap" %} ```bash @@ -18,6 +25,26 @@ python3 -m sglang.launch_server --model-path openai/gpt-oss-20b python3 -m sglang.launch_server --model-path openai/gpt-oss-120b --mem-fraction-static 0.95 ``` {% endcode %} +{% endstep %} + +{% step %} +### Benchmark + +SGLang version (0.5.1) + +
# gpt-oss-20b
python -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path openai/gpt-oss-20b --batch 1 --input-len 1024 --output-len 1024
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| 1/1024/1024 | 0.05 | 3.29 | 22668.19 | 304.59 |
| 1/8192/1024 | 0.15 | 3.39 | 55870.90 | 295.09 |
| 8/1024/1024 | 0.12 | 5.92 | 65760.01 | 1350.83 |
| 8/8192/1024 | 1.05 | 6.62 | 62209.72 | 1209.10 |
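The four rows above only vary `--batch` and `--input-len`, so a small sweep like the sketch below could reproduce the whole table, assuming the gpt-oss-20b server from the previous step is still listening on port 30000.

```bash
# Sweep the four batch/input configurations from the table above
# against the already-running gpt-oss-20b server.
for cfg in "1 1024" "1 8192" "8 1024" "8 8192"; do
  set -- $cfg   # $1 = batch size, $2 = input length
  python -m sglang.bench_one_batch_server \
    --base-url http://127.0.0.1:30000 \
    --model-path openai/gpt-oss-20b \
    --batch "$1" --input-len "$2" --output-len 1024
done
```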
# gpt-oss-120b
python -m sglang.bench_one_batch_server --base-url http://127.0.0.1:30000 --model-path openai/gpt-oss-120b --batch 1 --input-len 1024 --output-len 1024
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| 1/1024/1024 | 0.07 | 4.73 | 15803.59 | 211.49 |
| 1/8192/1024 | 0.23 | 4.89 | 35004.05 | 204.75 |
| 8/1024/1024 | 0.21 | 10.17 | 39132.98 | 786.63 |
| 8/8192/1024 | 1.76 | 11.20 | 37178.23 | 714.53 |
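Before timing the 120b model, it is worth confirming the server actually answers. A minimal smoke test, assuming the default OpenAI-compatible endpoint that `sglang.launch_server` exposes on port 30000 (the exact response shape may vary by SGLang version):

```bash
# Quick check against the OpenAI-compatible chat endpoint.
curl -s http://127.0.0.1:30000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "openai/gpt-oss-120b",
        "messages": [{"role": "user", "content": "Say hello in one word."}],
        "max_tokens": 8
      }'
```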
+{% endstep %} +{% endstepper %} ### Serving with 2 x H100 From ec1f9a18d5086df0b91fcdd143a644e6ddee3145 Mon Sep 17 00:00:00 2001 From: Xiaotong Jiang Date: Thu, 4 Sep 2025 00:43:09 -0700 Subject: [PATCH 2/5] . --- .../deepseek-v3.1-v3-r1/usage-guide.md | 201 ++++++++++++++++-- .../model-recipes/gpt-oss/usage-guide.md | 42 +++- 2 files changed, 219 insertions(+), 24 deletions(-) diff --git a/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md b/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md index 3973d23..d9b9c86 100644 --- a/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md +++ b/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md @@ -2,10 +2,17 @@ ### Serving with 1 x 8 x H200 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) +{% stepper %} +{% step %} +### Install SGLang - Note if you are using RDMA and are using docker, `--network host` and `--privileged` are required for `docker run` command. -2. Serve the model +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) + +Note if you are using RDMA and are using docker, `--network host` and `--privileged` are required for `docker run` command. +{% endstep %} + +{% step %} +### Serve the model {% code overflow="wrap" %} ```bash @@ -15,11 +22,26 @@ python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-r * You may need to set `NCCL_IB_GID_INDEX` if you are using RoCE, for example: `export NCCL_IB_GID_INDEX=3`. * [Optional Optimization Options](./#optional-performance-optimization) +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
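Until numbers land here, the same `bench_one_batch_server` pattern shown in the GPT-OSS recipe should carry over. A sketch, assuming the DeepSeek-V3 server from the previous step is up on the default port 30000:

```bash
# Hypothetical benchmark run mirroring the GPT-OSS recipe;
# vary --batch and --input-len to fill out the table rows.
python3 -m sglang.bench_one_batch_server \
  --base-url http://127.0.0.1:30000 \
  --model-path deepseek-ai/DeepSeek-V3 \
  --batch 1 --input-len 1024 --output-len 1024
```

The multi-node and CPU sections below can reuse the same command with their own `--model-path` and base URL.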
+{% endstep %} +{% endstepper %} ### Serving with 1 x 8 x MI300X -1. Install SGLang following [the instruction](../installation/amd-gpus.md) -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](../installation/amd-gpus.md) +{% endstep %} + +{% step %} +### Serve the model {% code overflow="wrap" %} ```bash @@ -28,11 +50,26 @@ python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-r {% endcode %} [Running DeepSeek-R1 on a single NDv5 MI300X VM](https://techcommunity.microsoft.com/blog/azurehighperformancecomputingblog/running-deepseek-r1-on-a-single-ndv5-mi300x-vm/4372726) could also be a good reference. +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with 2 x 8 x H100/800/20 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 2 nodes -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 2 nodes +{% endstep %} + +{% step %} +### Serve the model If the first node's IP is `10.0.0.1` , launch the server in both node with below commands @@ -49,11 +86,26 @@ python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --d * If the command fails, try setting the `GLOO_SOCKET_IFNAME` parameter. For more information, see [Common Environment Variables](https://pytorch.org/docs/stable/distributed.html#common-environment-variables). * If the multi nodes support NVIDIA InfiniBand and encounter hanging issues during startup, consider adding the parameter `export NCCL_IB_GID_INDEX=3`. For more information, see [this](https://github.com/sgl-project/sglang/issues/3516#issuecomment-2668493307). * [Optional Optimization Options](./#optional-performance-optimization) +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with Xeon 6980P CPU -1. Install SGLang following [the instruction](../installation/intel-xeon-cpus.md) -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](../installation/intel-xeon-cpus.md) +{% endstep %} + +{% step %} +### Serve the model * For w8a8\_int8 @@ -83,11 +135,26 @@ python -m sglang.launch_server \ --max-total-token 65536 \ --tp 6 ``` +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with 2 x 8 x H200 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 2 nodes -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 2 nodes +{% endstep %} + +{% step %} +### Serve the model If the first node's IP is `10.0.0.1` , launch the server in both node with below commands @@ -102,12 +169,32 @@ python3 -m sglang.launch_server --model-path deepseek-ai/DeepSeek-V3 --tp 16 --d {% endcode %} * [Optional Optimization Options](./#optional-performance-optimization) +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with 4 x 8 x A100 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 4 nodes -2. As A100 does not support FP8, we need to convert the [FP8 model checkpoints](https://huggingface.co/deepseek-ai/DeepSeek-V3) to BF16 with [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) mentioned [here](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) first -3. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 4 nodes +{% endstep %} + +{% step %} +### Convert Model Checkpoints + +As A100 does not support FP8, we need to convert the [FP8 model checkpoints](https://huggingface.co/deepseek-ai/DeepSeek-V3) to BF16 with [script](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) mentioned [here](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/fp8_cast_bf16.py) first +{% endstep %} + +{% step %} +### Serve the model If the first node's IP is `10.0.0.1` , and the converted model path is `/path/to/DeepSeek-V3-BF16`, launch the server in 4 nodes with below commands @@ -128,11 +215,31 @@ python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 - {% endcode %} * [Optional Optimization Options](./#optional-performance-optimization) +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
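For reference, the FP8-to-BF16 conversion from the earlier step is typically invoked as below. The paths are placeholders, and the flag names follow the `fp8_cast_bf16.py` script in the DeepSeek-V3 repository; double-check against the script you cloned:

```bash
# Convert the FP8 checkpoint to BF16 so A100s can serve it.
# Both paths are placeholders for your local directories.
git clone https://github.com/deepseek-ai/DeepSeek-V3.git
cd DeepSeek-V3/inference
python fp8_cast_bf16.py \
  --input-fp8-hf-path /path/to/DeepSeek-V3 \
  --output-bf16-hf-path /path/to/DeepSeek-V3-BF16
```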
+{% endstep %} +{% endstepper %} ### Serving with 8 x A100 +<<<<<<< HEAD 1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) 2. Serve the model +======= +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) +{% endstep %} + +{% step %} +### Serve the model +>>>>>>> c926237 (.) {% code overflow="wrap" %} ```bash @@ -151,11 +258,26 @@ python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ -- {% endcode %} Note that `awq_marlin` only supports `float16` now, which may lead to some precision loss. +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with 2 x 8 x A100/A800 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 4 nodes -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 4 nodes +{% endstep %} + +{% step %} +### Serve the model There are block-wise and per-channel quantization methods, weights have already been quantized in these huggingface checkpoint: @@ -179,11 +301,26 @@ python3 -m sglang.launch_server \ {% endcode %} * [Optional Optimization Options](./#optional-performance-optimization) +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with 4 x 8 x L40S nodes -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 4 nodes -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) for the 4 nodes +{% endstep %} + +{% step %} +### Serve the model Running with per-channel quantization model: @@ -211,12 +348,30 @@ python3 -m sglang.launch_server --model meituan/DeepSeek-R1-Channel-INT8 --tp 32 --enable-torch-compile --torch-compile-max-bs 32 ``` {% endcode %} +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Example: Serving on any cloud or Kubernetes with SkyPilot +{% stepper %} +{% step %} +### Install SkyPilot + SkyPilot helps find cheapest available GPUs across any cloud or existing Kubernetes clusters and launch distributed serving with a single command. See details [here](https://github.com/skypilot-org/skypilot/tree/master/llm/deepseek-r1). -To serve on multiple nodes: +```bash +git clone https://github.com/skypilot-org/skypilot.git +``` +{% endstep %} + +{% step %} +### Serve on multiple nodes {% code overflow="wrap" %} ```bash @@ -227,3 +382,11 @@ sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B.yaml --retry-until-up sky launch -c r1 llm/deepseek-r1/deepseek-r1-671B-A100.yaml --retry-until-up ``` {% endcode %} +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
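Once `sky launch` finishes, the cluster's head IP is what the benchmark needs. A sketch, assuming `sky status --ip` is available in your SkyPilot version and the YAML serves SGLang on port 30000:

```bash
# Look up the head-node IP of the "r1" cluster and smoke-test the server.
IP=$(sky status --ip r1)
curl -s "http://${IP}:30000/health"
```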
+{% endstep %} +{% endstepper %} diff --git a/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md b/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md index bc13f92..a41b9b5 100644 --- a/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md +++ b/sglang-cookbook/model-recipes/gpt-oss/usage-guide.md @@ -48,8 +48,15 @@ SGLang version (0.5.1) ### Serving with 2 x H100 -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) -2. Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFtIWT8LEMaYiYzz0p8P/~/changes/11/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) +{% endstep %} + +{% step %} +### Serve the model {% code overflow="wrap" %} ```bash @@ -57,11 +64,26 @@ SGLang version (0.5.1) python3 -m sglang.launch_server --model-path openai/gpt-oss-120b --tp 2 ``` {% endcode %} +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
+{% endstep %} +{% endstepper %} ### Serving with 1 x B200 -* Install SGLang following [the instruction](../installation/nvidia-blackwell-gpus.md) -* Serve the model +{% stepper %} +{% step %} +### Install SGLang + +Following [the instruction](../installation/nvidia-blackwell-gpus.md) +{% endstep %} + +{% step %} +### Serve the model {% code overflow="wrap" %} ```bash @@ -76,8 +98,10 @@ python3 -m sglang.launch_server --model-path openai/gpt-oss-20b python3 -m sglang.launch_server --model-path openai/gpt-oss-120b ``` {% endcode %} +{% endstep %} -#### With Speculative Decoding +{% step %} +### With Speculative Decoding {% code overflow="wrap" %} ```bash @@ -97,6 +121,14 @@ python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo E python3 -m sglang.launch_server --model openai/gpt-oss-120b --speculative-algo EAGLE3 --speculative-draft lmsys/EAGLE3-gpt-oss-120b-bf16 --speculative-num-steps 5 --speculative-eagle-topk 4 --speculative-num-draft-tokens 8 --attention-backend triton --tp 4 ``` {% endcode %} +{% endstep %} + +{% step %} +### Benchmark + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| _Benchmark results will be added here_ |  |  |  |  |
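Speculative decoding mostly shows up in ITL, so the useful comparison is the same workload with and without EAGLE3. A sketch using the same tool as the H100/H200 section, assuming the server from the previous step is on port 30000; run it once against a plain server and once against the EAGLE3 server, then compare ITL and output throughput:

```bash
# Same benchmark command for both server configurations.
python3 -m sglang.bench_one_batch_server \
  --base-url http://127.0.0.1:30000 \
  --model-path openai/gpt-oss-120b \
  --batch 1 --input-len 1024 --output-len 1024
```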
+{% endstep %} +{% endstepper %} ### Responses API & Built-in Tools From a600a785abe60b4a403046278dc571420aa40f22 Mon Sep 17 00:00:00 2001 From: Xiaotong Jiang Date: Sun, 7 Sep 2025 09:36:21 -0700 Subject: [PATCH 3/5] . --- .../model-recipes/deepseek-v3.1-v3-r1/usage-guide.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md b/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md index d9b9c86..72fd784 100644 --- a/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md +++ b/sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md @@ -226,10 +226,6 @@ python3 -m sglang.launch_server --model-path /path/to/DeepSeek-V3-BF16 --tp 32 - ### Serving with 8 x A100 -<<<<<<< HEAD -1. Install SGLang following [the instruction](https://app.gitbook.com/s/FFtIWT8LEMaYiYzz0p8P/sglang-cookbook/installation/nvidia-h-series-a-series-and-rtx-gpus) -2. Serve the model -======= {% stepper %} {% step %} ### Install SGLang @@ -239,7 +235,6 @@ Following [the instruction](https://app.gitbook.com/o/TvLfyTxdRQeudJH7e5QW/s/FFt {% step %} ### Serve the model ->>>>>>> c926237 (.) {% code overflow="wrap" %} ```bash From 5b4e163f6b1ea4ef29bb47ca4f6b17c1a9b12723 Mon Sep 17 00:00:00 2001 From: Admin Date: Mon, 8 Sep 2025 14:16:56 +0000 Subject: [PATCH 4/5] GITBOOK-23: No subject --- SUMMARY.md | 2 ++ .../model-recipes/llama-4/README.md | 6 ++++ .../model-recipes/llama-4/usage-guide.md | 30 +++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 sglang-cookbook/model-recipes/llama-4/README.md create mode 100644 sglang-cookbook/model-recipes/llama-4/usage-guide.md diff --git a/SUMMARY.md b/SUMMARY.md index e0bbea5..71da277 100644 --- a/SUMMARY.md +++ b/SUMMARY.md @@ -15,6 +15,8 @@ * [Usage Guide](sglang-cookbook/model-recipes/deepseek-v3.1-v3-r1/usage-guide.md) * [GPT-OSS](sglang-cookbook/model-recipes/gpt-oss/README.md) * [Usage Guide](sglang-cookbook/model-recipes/gpt-oss/usage-guide.md) + * [Llama 4](sglang-cookbook/model-recipes/llama-4/README.md) + * [Usage Guide](sglang-cookbook/model-recipes/llama-4/usage-guide.md) * [API](sglang-cookbook/api/README.md) * [OpenAI APIs - Completions](sglang-cookbook/api/openai-apis-completions.md) * [OpenAI APIs - Vision](sglang-cookbook/api/openai-apis-vision.md) diff --git a/sglang-cookbook/model-recipes/llama-4/README.md b/sglang-cookbook/model-recipes/llama-4/README.md new file mode 100644 index 0000000..fc1964b --- /dev/null +++ b/sglang-cookbook/model-recipes/llama-4/README.md @@ -0,0 +1,6 @@ +# Llama 4 + +Llama 4 Scout + +
| Weight Type | Hardware Configuration | Instruction | Benchmark |
| --- | --- | --- | --- |
|  | 4 x H100/H200 | Broken link | Broken link |
|  | 8 x H100/H200 | Broken link |  |
|  | 4 x MI300X |  |  |
+ diff --git a/sglang-cookbook/model-recipes/llama-4/usage-guide.md b/sglang-cookbook/model-recipes/llama-4/usage-guide.md new file mode 100644 index 0000000..63d1af9 --- /dev/null +++ b/sglang-cookbook/model-recipes/llama-4/usage-guide.md @@ -0,0 +1,30 @@ +# Usage Guide + +### Serving with 1 x 4 x H200 + +{% stepper %} +{% step %} +#### Install SGLang + + +{% endstep %} + +{% step %} +#### Serve the model (text only) + +{% code overflow="wrap" %} +```bash +python3 -m sglang.launch_server \ + --model-path meta-llama/Llama-4-Scout-17B-16E-Instruct \ + --host 0.0.0.0 \ + --port 30000 +``` +{% endcode %} + + +{% endstep %} + +{% step %} +#### Benchmark +{% endstep %} +{% endstepper %} From 215ab38677e7e8a22e01e5550ae826f849d75f7f Mon Sep 17 00:00:00 2001 From: zhenlinc Date: Sun, 5 Oct 2025 11:10:09 +0000 Subject: [PATCH 5/5] GITBOOK-28: No subject --- SUMMARY.md | 4 ++ .../model-recipes/llama-3.1-70b/README.md | 2 + .../llama-3.1-70b/usage-guide.md | 66 ++++++++++++++++++ .../qwen3-next-80b-a3b/README.md | 2 + .../qwen3-next-80b-a3b/usage-guide.md | 68 +++++++++++++++++++ 5 files changed, 142 insertions(+) create mode 100644 sglang-cookbook/model-recipes/llama-3.1-70b/README.md create mode 100644 sglang-cookbook/model-recipes/llama-3.1-70b/usage-guide.md create mode 100644 sglang-cookbook/model-recipes/qwen3-next-80b-a3b/README.md create mode 100644 sglang-cookbook/model-recipes/qwen3-next-80b-a3b/usage-guide.md diff --git a/SUMMARY.md b/SUMMARY.md index 71da277..1277737 100644 --- a/SUMMARY.md +++ b/SUMMARY.md @@ -17,6 +17,10 @@ * [Usage Guide](sglang-cookbook/model-recipes/gpt-oss/usage-guide.md) * [Llama 4](sglang-cookbook/model-recipes/llama-4/README.md) * [Usage Guide](sglang-cookbook/model-recipes/llama-4/usage-guide.md) + * [Llama-3.1-70B](sglang-cookbook/model-recipes/llama-3.1-70b/README.md) + * [Usage Guide](sglang-cookbook/model-recipes/llama-3.1-70b/usage-guide.md) + * [Qwen3-Next-80B-A3B](sglang-cookbook/model-recipes/qwen3-next-80b-a3b/README.md) + * [Usage Guide](sglang-cookbook/model-recipes/qwen3-next-80b-a3b/usage-guide.md) * [API](sglang-cookbook/api/README.md) * [OpenAI APIs - Completions](sglang-cookbook/api/openai-apis-completions.md) * [OpenAI APIs - Vision](sglang-cookbook/api/openai-apis-vision.md) diff --git a/sglang-cookbook/model-recipes/llama-3.1-70b/README.md b/sglang-cookbook/model-recipes/llama-3.1-70b/README.md new file mode 100644 index 0000000..561ff8f --- /dev/null +++ b/sglang-cookbook/model-recipes/llama-3.1-70b/README.md @@ -0,0 +1,2 @@ +# Llama-3.1-70B + diff --git a/sglang-cookbook/model-recipes/llama-3.1-70b/usage-guide.md b/sglang-cookbook/model-recipes/llama-3.1-70b/usage-guide.md new file mode 100644 index 0000000..091c419 --- /dev/null +++ b/sglang-cookbook/model-recipes/llama-3.1-70b/usage-guide.md @@ -0,0 +1,66 @@ +# Usage Guide + +### Serving with 1 x 4 x H200 + +{% stepper %} +{% step %} +#### Install SGLang + +Following [the instruction](../../installation/nvidia-h-series-a-series-and-rtx-gpus.md) +{% endstep %} + +{% step %} +#### Serve the model + +```sh +python3 -m sglang.launch_server \ + --model meta-llama/Llama-3.1-70B-Instruct \ + --tp 4 --trust-remote-code \ + --mem-fraction-static 0.95 \ + --port 30000 \ + --attention-backend triton +``` +{% endstep %} + +{% step %} +#### Benchmark + +```shell +# BS=1/Input=1024/Ouput=1024 +python3 -m sglang.bench_one_batch_server \ + --model meta-llama/Llama-3.1-70B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 1 \ + --input-len 1024 \ + --output-len 1024 + + +# 1/8192/1024 
+python3 -m sglang.bench_one_batch_server \ + --model meta-llama/Llama-3.1-70B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 1 \ + --input-len 8192 \ + --output-len 1024 + +# 8/1024/1024 +python3 -m sglang.bench_one_batch_server \ + --model meta-llama/Llama-3.1-70B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 8 \ + --input-len 1024 \ + --output-len 1024 + +# 8/8192/1024 +python3 -m sglang.bench_one_batch_server \ + --model meta-llama/Llama-3.1-70B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 8 \ + --input-len 8192 \ + --output-len 1024 +``` + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| 1/1024/1024 | 0.23 | 63 | 4418.12 | 15.73 |
| 1/8192/1024 | 2.19 | 50 | 3737.24 | 19.75 |
| 8/1024/1024 | 0.58 | 2 | 14052.04 | 79.11 |
| 8/8192/1024 | 5.22 | 3 | 12556.62 | 355.16 |
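As a quick consistency check, input throughput should be roughly batch size × input length ÷ TTFT: for the last row, 8 × 8192 / 5.22 ≈ 12555 tok/s, which matches the reported 12556.62.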
+{% endstep %} +{% endstepper %} + diff --git a/sglang-cookbook/model-recipes/qwen3-next-80b-a3b/README.md b/sglang-cookbook/model-recipes/qwen3-next-80b-a3b/README.md new file mode 100644 index 0000000..6cfe5c7 --- /dev/null +++ b/sglang-cookbook/model-recipes/qwen3-next-80b-a3b/README.md @@ -0,0 +1,2 @@ +# Qwen3-Next-80B-A3B + diff --git a/sglang-cookbook/model-recipes/qwen3-next-80b-a3b/usage-guide.md b/sglang-cookbook/model-recipes/qwen3-next-80b-a3b/usage-guide.md new file mode 100644 index 0000000..7b79648 --- /dev/null +++ b/sglang-cookbook/model-recipes/qwen3-next-80b-a3b/usage-guide.md @@ -0,0 +1,68 @@ +# Usage Guide + +### Serving with 1 x 4 x H200 + +{% stepper %} +{% step %} +#### Install SGLang + +Following [the instruction](../../installation/nvidia-h-series-a-series-and-rtx-gpus.md) +{% endstep %} + +{% step %} +#### Serve the model + +```sh +python3 -m sglang.launch_server \ + --model Qwen/Qwen3-Next-80B-A3B-Instruct \ + --tp 4 --trust-remote-code \ + --mem-fraction-static 0.95 \ + --port 30000 \ + --attention-backend triton +``` +{% endstep %} + +{% step %} +#### Benchmark + +```sh +# BS=1/Input=1024/Ouput=1024 +python3 -m sglang.bench_one_batch_server \ + --model Qwen/Qwen3-Next-80B-A3B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 1 \ + --input-len 1024 \ + --output-len 1024 + + +# 1/8192/1024 +python3 -m sglang.bench_one_batch_server \ + --model Qwen/Qwen3-Next-80B-A3B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 1 \ + --input-len 8192 \ + --output-len 1024 + +# 8/1024/1024 +python3 -m sglang.bench_one_batch_server \ + --model Qwen/Qwen3-Next-80B-A3B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 8 \ + --input-len 1024 \ + --output-len 1024 + +# 8/8192/1024 +python3 -m sglang.bench_one_batch_server \ + --model Qwen/Qwen3-Next-80B-A3B-Instruct \ + --base-url http://localhost:30000 \ + --batch-size 8 \ + --input-len 8192 \ + --output-len 1024 +``` + +
| BS/Input/Output Length | TTFT (s) | ITL (ms) | Input Throughput (tok/s) | Output Throughput (tok/s) |
| --- | --- | --- | --- | --- |
| 1/1024/1024 | 0.40 | 12 | 2547.99 | 81.41 |
| 1/8192/1024 | 1.27 | 15 | 6459.45 | 67.89 |
| 8/1024/1024 | 0.95 | 3 | 8665.16 | 289.15 |
| 8/8192/1024 | 3.17 | 2 | 20642.36 | 474.05 |
+ +\ + +{% endstep %} +{% endstepper %}