From d501a3f90587d30bea65f4c0ab19da419ca8fc24 Mon Sep 17 00:00:00 2001
From: hhu-scitix <hhu@scitix.ai>
Date: Fri, 11 Jul 2025 08:32:44 +0000
Subject: [PATCH 01/12] add pd disaggregation deploy in vllm and sglang

---
 .gitmodules                                   |   6 +
 .../k8s/inference/sglang/2p2d/d.yaml          | 212 ++++++++++
 .../k8s/inference/sglang/2p2d/lb.yaml         |  60 +++
 .../k8s/inference/sglang/2p2d/p.yaml          | 211 ++++++++++
 .../k8s/inference/sglang/4p9d/d.yaml          | 246 +++++++++++
 .../k8s/inference/sglang/4p9d/lb.yaml         |  68 ++++
 .../k8s/inference/sglang/4p9d/p.yaml          | 251 ++++++++++++
 .../inference/sglang/one-engine/server.yaml   |  89 ++++
 .../k8s/inference/vllm/nixl/2p2d.yaml         | 381 ++++++++++++++++++
 thirdparty/sglang                             |   1 +
 thirdparty/vllm                               |   1 +
 11 files changed, 1526 insertions(+)
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/d.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/p.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/4p9d/d.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/4p9d/p.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/one-engine/server.yaml
 create mode 100644 launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml
 create mode 160000 thirdparty/sglang
 create mode 160000 thirdparty/vllm

diff --git a/.gitmodules b/.gitmodules
index 475fe99..e1dbd83 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,9 @@
 [submodule "thirdparty/Megatron-LM"]
 	path = thirdparty/Megatron-LM
 	url = https://github.com/NVIDIA/Megatron-LM.git
+[submodule "thirdparty/vllm"]
+	path = thirdparty/vllm
+	url = https://github.com/vllm-project/vllm.git
+[submodule "thirdparty/sglang"]
+	path = thirdparty/sglang
+	url = https://github.com/sgl-project/sglang.git
diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml
new file mode 100644
index 0000000..8223063
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml
@@ -0,0 +1,212 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet  # SGLang server group started with --disaggregation-mode decode (see args)
+metadata:
+  name: deepseekr10528-2decode
+  namespace: t-hisys-xlliu
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:  # pod template for the group's leader
+      metadata:
+        annotations:  # NOTE(review): both keys are presumably consumed by cluster add-ons (GID-injection webhook, multi-network CNI) — confirm
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+        labels:
+          role: leader  # matched by the deepseekr10528-2decode-svc selector at the end of this file
+      spec:
+        containers:
+        - command:  # bash -c executes the one script supplied in args
+          - /usr/bin/env
+          - bash
+          - -c
+          args:  # $(VAR) is substituted by Kubernetes from the container env; LWS_LEADER_ADDRESS/LWS_GROUP_SIZE are not declared below — presumably injected by the LWS controller
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --chunked-prefill-size "262144" --page-size "64" --enable-dp-attention --enable-dp-lm-head \
+            --dp-size "16" --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \
+            --mem-fraction-static "0.849" --context-length "32768" \
+            --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+            --cuda-graph-max-bs "64" --max-running-requests "2048" \
+            --tp-size "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \
+            --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \
+            --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1"
+          env:  # all values are quoted so they reach the container as strings
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "5"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA  # per NCCL docs: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod's worker-index label; passed as --node-rank in the script
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest  # NOTE(review): mutable :latest tag — consider pinning a version or digest
+          name: sglang-leader
+          ports:
+          - containerPort: 30000  # same port as --port in the launch command
+            protocol: TCP
+          readinessProbe:  # TCP connect check against the serving port
+            periodSeconds: 300  # probed once every 5 minutes
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # 8 GPUs per pod; with size: 2 this yields the 16 ranks of --tp-size/--dp-size
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+          - mountPath: /dev/shm  # backed by the in-memory emptyDir declared under volumes
+            name: dshm
+          - mountPath: /mnt/xstorage/model  # host directory containing the --model-path checkpoint
+            name: model
+        nodeSelector:
+          unit: "2"  # schedule only on nodes labelled unit=2
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+    restartPolicy: RecreateGroupOnPodRestart  # presumably the whole group is recreated when any pod restarts — see LWS docs
+    size: 2  # pods per group: 1 leader + 1 worker
+    workerTemplate:  # pod template for the non-leader pods; runs the same launch script as the leader
+      metadata:
+        annotations:
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+      spec:
+        containers:
+        - command:
+          - /usr/bin/env
+          - bash
+          - -c
+          args:  # identical script to the leader; only the substituted LWS_WORKER_INDEX differs per pod
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --chunked-prefill-size "262144" --page-size "64" --enable-dp-attention --enable-dp-lm-head \
+            --dp-size "16" --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \
+            --mem-fraction-static "0.849" --context-length "32768" \
+            --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+            --cuda-graph-max-bs "64" --max-running-requests "2048" \
+            --tp-size "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \
+            --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \
+            --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1"
+          env:
+          - name: NVSHMEM_IB_TRAFFIC_CLASS  # NOTE(review): not set on the leader, which instead sets CUDA_LAUNCH_BLOCKING — confirm the asymmetry is intended
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "5"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name:  NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          # - name: NCCL_IB_TC
+          #   value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          # - name: NCCL_IB_SL
+          #   value: "5"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA  # per NCCL docs: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod's worker-index label; passed as --node-rank in the script
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          name: sglang-worker
+          ports:
+          - containerPort: 30001  # NOTE(review): the launch command passes --port "30000"; nothing visible here binds 30001 — confirm
+            protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt/xstorage/model
+            name: model
+        nodeSelector:
+          unit: "2"
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+  networkConfig:
+    subdomainPolicy: Shared
+  replicas: 1  # a single leader+worker group
+  rolloutStrategy:
+    rollingUpdateConfiguration:
+      maxSurge: 0  # no extra group is created during an update
+      maxUnavailable: 1  # the single group may be taken down while updating
+    type: RollingUpdate
+  startupPolicy: LeaderCreated  # presumably workers are created once the leader pod exists — confirm against LWS docs
+---
+apiVersion: v1
+kind: Service  # stable in-cluster address for the decode group's leader pod
+metadata:
+  namespace: t-hisys-xlliu
+  name: deepseekr10528-2decode-svc
+spec:
+  ports:
+    - port: 30000  # port exposed by the Service
+      targetPort: 30000  # containerPort of the sglang-leader container
+      protocol: TCP
+  selector:  # both labels must match, so only the leader pod is selected
+    role: leader
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-2decode
diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml
new file mode 100644
index 0000000..bf5d960
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml
@@ -0,0 +1,60 @@
+apiVersion: apps/v1
+kind: Deployment  # runs sglang's mini_lb module in front of the prefill and decode Services
+metadata:
+  namespace: t-hisys-xlliu
+  name: deepseekr10528-2p2d-lb
+  labels:
+    app: deepseekr10528-2p2d-lb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseekr10528-2p2d-lb  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: deepseekr10528-2p2d-lb
+    spec:
+      volumes:
+        - name: dshm  # memory-backed scratch space mounted at /dev/shm
+          emptyDir:
+            medium: Memory
+        - name: model  # host directory with the model files
+          hostPath:
+            path: /mnt/xstorage/model
+      containers:
+        - name: sgl-minilb
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          command:  # argv is passed as-is; no shell is involved
+            - python
+            - -m
+            - sglang.srt.disaggregation.mini_lb
+            - --prefill
+            - http://deepseekr10528-2prefill-svc:30000
+            - --decode
+            - http://deepseekr10528-2decode-svc:30000
+            - --host
+            - 0.0.0.0
+            - --port
+            - "8000"
+          ports:
+            - containerPort: 8000  # same value as --port above
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+            - name: model
+              mountPath: /mnt/xstorage/model
+---
+apiVersion: v1
+kind: Service  # entry point for clients of the 2p2d load balancer
+metadata:
+  namespace: t-hisys-xlliu
+  name: deepseekr10528-2p2d-lb-svc
+spec:
+  type: ClusterIP  # in-cluster access only; switch to NodePort for quick external testing
+  ports:
+    - port: 8000  # port exposed by the Service inside the cluster
+      targetPort: 8000  # containerPort of the sgl-minilb container
+      protocol: TCP
+  selector:
+    app: deepseekr10528-2p2d-lb  # pod label set by the load-balancer Deployment
diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml
new file mode 100644
index 0000000..d479bd3
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml
@@ -0,0 +1,211 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet  # SGLang server group started with --disaggregation-mode prefill (see args)
+metadata:
+  name: deepseekr10528-2prefill
+  namespace: t-hisys-xlliu
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:  # pod template for the group's leader
+      metadata:
+        annotations:  # NOTE(review): presumably consumed by a multi-network CNI to attach rdma0..rdma7 — confirm
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+        labels:
+          role: leader  # matched by the deepseekr10528-2prefill-svc selector at the end of this file
+      spec:
+        containers:
+        - command:  # bash -c executes the one script supplied in args
+          - /usr/bin/env
+          - bash
+          - -c
+          args:  # $(VAR) is substituted by Kubernetes from the container env; LWS_LEADER_ADDRESS/LWS_GROUP_SIZE are not declared below — presumably injected by the LWS controller
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \
+            --chunked-prefill-size "524288" --max-prefill-tokens "32768" \
+            --page-size "64" --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek \
+            --enable-dp-lm-head --enable-dp-attention --dp-size "16" --disable-radix-cache \
+            --enable-deepep-moe --deepep-mode normal --disaggregation-mode prefill \
+            --mem-fraction-static "0.7" --context-length "32768" \
+            --tp "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \
+            --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \
+            --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" --max-running-requests "1024"
+          env:
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "5"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_GID_INDEX  # set explicitly; this template carries no roce/gid-injection annotation
+            value: "5"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC  # NOTE(review): the worker container below sets "true" — confirm which value is intended
+            value: "false"
+          - name: NCCL_IB_HCA  # per NCCL docs: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod's worker-index label; passed as --node-rank in the script
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest  # NOTE(review): mutable :latest tag — consider pinning a version or digest
+          name: sglang-leader
+          ports:
+          - containerPort: 30000  # same port as --port in the launch command
+            protocol: TCP
+          readinessProbe:  # TCP connect check against the serving port
+            periodSeconds: 300  # probed once every 5 minutes
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # 8 GPUs per pod; with size: 2 this yields the 16 ranks of --tp/--dp-size
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          # securityContext:
+          #   capabilities:
+          #     add:
+          #     - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm  # backed by the in-memory emptyDir declared under volumes
+            name: dshm
+          - mountPath: /mnt/xstorage/model  # host directory containing the --model-path checkpoint
+            name: model
+        nodeSelector:
+          unit: "2"  # schedule only on nodes labelled unit=2
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+    restartPolicy: RecreateGroupOnPodRestart  # presumably the whole group is recreated when any pod restarts — see LWS docs
+    size: 2  # pods per group: 1 leader + 1 worker
+    workerTemplate:  # pod template for the non-leader pods; runs the same launch script as the leader
+      metadata:
+        annotations:
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+      spec:
+        containers:
+        - command:
+          - /usr/bin/env
+          - bash
+          - -c
+          args:  # identical script to the leader; only the substituted LWS_WORKER_INDEX differs per pod
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \
+            --chunked-prefill-size "524288" --max-prefill-tokens "32768" \
+            --page-size "64" --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek \
+            --enable-dp-lm-head --enable-dp-attention --dp-size "16" --disable-radix-cache \
+            --enable-deepep-moe --deepep-mode normal --disaggregation-mode prefill \
+            --mem-fraction-static "0.7" --context-length "32768" \
+            --tp "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \
+            --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \
+            --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" --max-running-requests "1024"
+          env:  # NOTE(review): sets NVSHMEM_IB_TRAFFIC_CLASS, CUDA_LAUNCH_BLOCKING, SGLANG_MOONCAKE_TRANS_THREAD and SGL_CHUNKED_PREFIX_CACHE_THRESHOLD, which the leader env does not — confirm
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_HCA  # per NCCL docs: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "5"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "8"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
+            value: "0"
+          - name: NCCL_IB_GID_INDEX
+            value: "5"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC  # NOTE(review): the leader container above sets "false" — confirm which value is intended
+            value: "true"
+          - name: LWS_WORKER_INDEX  # copied from the pod's worker-index label; passed as --node-rank in the script
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          name: sglang-worker
+          ports:
+          - containerPort: 30001  # NOTE(review): the launch command passes --port "30000"; nothing visible here binds 30001 — confirm
+            protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt/xstorage/model
+            name: model
+        nodeSelector:
+          unit: "2"
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+---
+apiVersion: v1
+kind: Service  # stable in-cluster address for the prefill group's leader pod
+metadata:
+  namespace: t-hisys-xlliu
+  name: deepseekr10528-2prefill-svc
+spec:
+  ports:
+    - port: 30000  # port exposed by the Service
+      targetPort: 30000  # containerPort of the sglang-leader container
+      protocol: TCP
+  selector:  # both labels must match, so only the leader pod is selected
+    role: leader
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-2prefill
diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml
new file mode 100644
index 0000000..684e202
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml
@@ -0,0 +1,246 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet  # SGLang server group started with --disaggregation-mode decode (see args)
+metadata:
+  name: deepseekr10528-decode
+  namespace: t-hisys-xlliu
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:  # pod template for the group's leader
+      metadata:
+        annotations:  # NOTE(review): both keys are presumably consumed by cluster add-ons (multi-network CNI, GID-injection webhook) — confirm
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+        labels:
+          role: leader  # matched by the deepseekr10528-decode-svc selector at the end of this file
+      spec:
+        containers:
+        - command:  # bash -c executes the one script supplied in args
+          - /usr/bin/env
+          - bash
+          - -c
+          args:  # $(VAR) is substituted by Kubernetes from the container env; LWS_LEADER_ADDRESS/LWS_GROUP_SIZE are not declared below — presumably injected by the LWS controller
+          - |-
+            set -o pipefail  # exit with launch_server's status instead of tee's
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            # Debug aids: LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}"; echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            # sleep inf
+            SGLANG_NUM_RESERVED_DECODE_TOKENS=102 MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 \
+            python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+            --disaggregation-mode decode --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \
+            --nnodes $LWS_GROUP_SIZE --node-rank $LWS_WORKER_INDEX --tp-size 72 --dp-size 72 \
+            --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \
+            --page-size 1 --trust-remote-code --moe-dense-tp-size 1 \
+            --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \
+            --enable-two-batch-overlap --deepep-mode low_latency \
+            --mem-fraction-static 0.835 --max-running-requests 18432 --context-length 4500 \
+            --ep-num-redundant-experts 32 --cuda-graph-bs 256 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/decode_${LWS_WORKER_INDEX}.log
+            # --init-expert-location YOUR_PATH --ep-num-redundant-experts 32 --cuda-graph-bs 256
+          env:
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_GID_INDEX  # NOTE(review): the roce/gid-injection annotation names this same variable — confirm which source wins
+            value: "5"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          # - name: NCCL_IB_TC
+          #   value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          # - name: NCCL_IB_SL
+          #   value: "5"
+          - name: MC_TE_METRIC  # also exported inline in front of the launch command
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA  # per NCCL docs: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod's worker-index label; used for --node-rank and the log file name
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest  # NOTE(review): mutable :latest tag — consider pinning a version or digest
+          name: sglang-leader
+          ports:
+          - containerPort: 30000  # same port as --port in the launch command
+            protocol: TCP
+          readinessProbe:  # TCP connect check against the serving port
+            periodSeconds: 300  # probed once every 5 minutes
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # 8 GPUs per pod; with size: 9 this yields the 72 ranks of --tp-size/--dp-size
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          # securityContext:
+          #   capabilities:
+          #     add:
+          #     - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm  # backed by the in-memory emptyDir declared under volumes
+            name: dshm
+          - mountPath: /mnt/xstorage/model  # host directory containing the --model-path checkpoint
+            name: model
+          - mountPath: /mnt/xstorage/xlliu  # host directory that receives the tee'd decode_<index>.log
+            name: xlliu
+        nodeSelector:
+          unit: "2"  # schedule only on nodes labelled unit=2
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+        - hostPath:
+            path: /mnt/xstorage/xlliu
+          name: xlliu
+    restartPolicy: RecreateGroupOnPodRestart  # presumably the whole group is recreated when any pod restarts — see LWS docs
+    size: 9  # pods per group: 1 leader + 8 workers
+    workerTemplate:  # pod template for the non-leader pods; runs the same launch script as the leader
+      metadata:
+        annotations:
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+      spec:
+        containers:
+        - command:
+          - /usr/bin/env
+          - bash
+          - -c
+          args:  # identical script to the leader; only the substituted LWS_WORKER_INDEX differs per pod
+          - |-
+            set -o pipefail  # exit with launch_server's status instead of tee's
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            # Debug aids: LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}"; echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            # sleep inf
+            SGLANG_NUM_RESERVED_DECODE_TOKENS=102 MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 \
+            python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+            --disaggregation-mode decode --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \
+            --nnodes $LWS_GROUP_SIZE --node-rank $LWS_WORKER_INDEX --tp-size 72 --dp-size 72 \
+            --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \
+            --page-size 1 --trust-remote-code --moe-dense-tp-size 1 \
+            --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \
+            --enable-two-batch-overlap --deepep-mode low_latency \
+            --mem-fraction-static 0.835 --max-running-requests 18432 --context-length 4500 \
+            --ep-num-redundant-experts 32 --cuda-graph-bs 256 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/decode_${LWS_WORKER_INDEX}.log
+            # --init-expert-location YOUR_PATH --ep-num-redundant-experts 32 --cuda-graph-bs 256
+          env:
+          - name: NVSHMEM_IB_TRAFFIC_CLASS  # NOTE(review): not set on the leader, which instead sets CUDA_LAUNCH_BLOCKING — confirm the asymmetry is intended
+            value: "16"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_GID_INDEX  # NOTE(review): the roce/gid-injection annotation names this same variable — confirm which source wins
+            value: "5"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          # - name: NCCL_IB_TC
+          #   value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC  # also exported inline in front of the launch command
+            value: "true"
+          # - name: NCCL_IB_SL
+          #   value: "5"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_HCA  # per NCCL docs: "^" excludes the listed HCAs, "=" requires an exact name match
+            value: ^=mlx5_0,mlx5_5,mlx5_6
+          - name: LWS_WORKER_INDEX  # copied from the pod's worker-index label; used for --node-rank and the log file name
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          name: sglang-worker
+          ports:
+          - containerPort: 30001  # NOTE(review): the launch command passes --port "30000"; nothing visible here binds 30001 — confirm
+            protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          # securityContext:
+          #   capabilities:
+          #     add:
+          #     - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt/xstorage/model
+            name: model
+          - mountPath: /mnt/xstorage/xlliu
+            name: xlliu
+        nodeSelector:
+          unit: "2"
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+        - hostPath:
+            path: /mnt/xstorage/xlliu
+          name: xlliu
+  networkConfig:
+    subdomainPolicy: Shared
+  replicas: 1  # a single 9-pod group
+  rolloutStrategy:
+    rollingUpdateConfiguration:
+      maxSurge: 0  # no extra group is created during an update
+      maxUnavailable: 1  # the single group may be taken down while updating
+    type: RollingUpdate
+  startupPolicy: LeaderCreated  # presumably workers are created once the leader pod exists — confirm against LWS docs
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-decode-svc
+  namespace: t-hisys-xlliu
+spec:
+  selector:
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000
diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml
new file mode 100644
index 0000000..2d3267a
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml
@@ -0,0 +1,68 @@
+apiVersion: apps/v1
+kind: Deployment  # runs sglang's mini_lb module in front of the prefill and decode Services
+metadata:
+  namespace: t-hisys-xlliu
+  name: deepseekr10528-lb-main
+  labels:
+    app: deepseekr10528-lb
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseekr10528-lb  # must equal the pod template label below
+  template:
+    metadata:
+      labels:
+        app: deepseekr10528-lb
+    spec:
+      # nodeSelector:
+      #     bo: "yes"
+      # tolerations:
+      #   - key: bopd
+      #     operator: Exists
+      #   - key: node-role
+      #     operator: Exists
+      volumes:
+        - name: dshm  # memory-backed scratch space mounted at /dev/shm
+          emptyDir:
+            medium: Memory
+        - name: model  # host directory with the model files
+          hostPath:
+            path: /mnt/xstorage/model
+      containers:
+        - name: sgl-minilb
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          command:  # argv is passed as-is; no shell is involved
+            - python
+            - -m
+            - sglang.srt.disaggregation.mini_lb
+            - --prefill
+            - http://deepseekr10528-prefill-svc:30000
+            - --decode
+            - http://deepseekr10528-decode-svc:30000
+            - --host
+            - 0.0.0.0
+            - --port
+            - "8000"
+          ports:
+            - containerPort: 8000  # same value as --port above
+          volumeMounts:
+            - name: dshm
+              mountPath: /dev/shm
+            - name: model
+              mountPath: /mnt/xstorage/model
+---
+apiVersion: v1
+kind: Service  # entry point for clients of the 4p9d load balancer
+metadata:
+  namespace: t-hisys-xlliu
+  name: deepseekr10528-lb-svc
+spec:
+  type: ClusterIP  # in-cluster access only; switch to NodePort for quick external testing
+  ports:
+    - port: 8000  # port exposed by the Service inside the cluster
+      targetPort: 8000  # containerPort of the sgl-minilb container
+      protocol: TCP
+      # nodePort: 30800
+  selector:
+    app: deepseekr10528-lb  # pod label set by the load-balancer Deployment
diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml
new file mode 100644
index 0000000..887676e
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml
@@ -0,0 +1,251 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseekr10528-prefill
+  namespace: t-hisys-xlliu
+spec:
+  leaderWorkerTemplate:
+    leaderTemplate:
+      metadata:
+        annotations:
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+        labels:
+          role: leader  # the prefill Service selects pods by this label
+      spec:
+        containers:
+        - command:  # the script in args is executed through bash -c
+          - /usr/bin/env
+          - bash
+          - -c
+          args:
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}"
+            # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            # sleep inf
+            MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \
+            --disaggregation-mode prefill --dist-init-addr ${LWS_LEADER_ADDRESS}:20102 \
+            --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \
+            --tp-size 32 --dp-size 32 \
+            --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \
+            --page-size 1 --trust-remote-code --moe-dense-tp-size 1 \
+            --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \
+            --enable-two-batch-overlap --deepep-mode normal \
+            --mem-fraction-static 0.85 --chunked-prefill-size 524288 \
+            --max-running-requests 8192 --max-total-tokens 131072 \
+            --context-length 8192 --ep-num-redundant-experts 32 \
+            --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/prefill_${LWS_WORKER_INDEX}.log
+            # --context-length 8192 --init-expert-location YOUR_PATH \
+            # --ep-num-redundant-experts 32 --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek --deepep-config YOUR_PATH
+          env:
+          - name: NVSHMEM_HCA_PE_MAPPING
+            # adjust to match the RDMA devices of your environment
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "5"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_DEBUG
+            value: INFO
+          # - name: NCCL_IB_TC
+          #   value: "160"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "false"  # no effect on the server: the launch command sets MC_TE_METRIC=true inline
+          # - name: NCCL_IB_SL
+          #   value: "5"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6  # NCCL syntax: "^" excludes, "=" requires an exact name match
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          name: sglang-leader
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            periodSeconds: 300  # the TCP check on the server port runs only every 5 minutes
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          # securityContext:
+          #   capabilities:
+          #     add:
+          #     - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt/xstorage/model
+            name: model
+          - mountPath: /mnt/xstorage/xlliu  # the launch script tees the server log below this path
+            name: xlliu
+        nodeSelector:
+          unit: "2"
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+        - hostPath:
+            path: /mnt/xstorage/xlliu
+          name: xlliu
+    restartPolicy: RecreateGroupOnPodRestart
+    size: 4
+    workerTemplate:
+      metadata:
+        annotations:
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+          k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+      spec:
+        containers:
+        - command:  # the script in args is executed through bash -c
+          - /usr/bin/env
+          - bash
+          - -c
+          args:
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}"
+            # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            # sleep inf
+            MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+            --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \
+            --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \
+            --disaggregation-mode prefill --dist-init-addr ${LWS_LEADER_ADDRESS}:20102 \
+            --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \
+            --tp-size 32 --dp-size 32 \
+            --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \
+            --page-size 1 --trust-remote-code --moe-dense-tp-size 1 \
+            --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \
+            --enable-two-batch-overlap --deepep-mode normal \
+            --mem-fraction-static 0.85 --chunked-prefill-size 524288 \
+            --max-running-requests 8192 --max-total-tokens 131072 \
+            --context-length 8192 --ep-num-redundant-experts 32 \
+            --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/prefill_${LWS_WORKER_INDEX}.log
+            # --context-length 8192 --init-expert-location YOUR_PATH \
+            # --ep-num-redundant-experts 32 --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek --deepep-config YOUR_PATH
+          env:
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            # adjust to match the RDMA devices of your environment
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_HCA
+            value: ^=mlx5_0,mlx5_5,mlx5_6  # NCCL syntax: "^" excludes, "=" requires an exact name match
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "8"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD
+            value: "0"
+          - name: NCCL_IB_GID_INDEX
+            value: "5"  # NOTE(review): the roce/gid-injection annotation also targets this variable - confirm which value wins
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          # - name: NCCL_IB_TC
+          #   value: "136"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          # - name: NCCL_IB_SL
+          #   value: "5"
+          - name: LWS_WORKER_INDEX
+            valueFrom:
+              fieldRef:
+                fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index']
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          name: sglang-worker
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          # securityContext:
+          #   capabilities:
+          #     add:
+          #     - IPC_LOCK
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt/xstorage/model
+            name: model
+          - mountPath: /mnt/xstorage/xlliu  # the launch script tees the server log below this path
+            name: xlliu
+        nodeSelector:
+          unit: "2"
+        volumes:
+        - emptyDir:
+            medium: Memory
+          name: dshm
+        - hostPath:
+            path: /mnt/xstorage/model
+          name: model
+        - hostPath:
+            path: /mnt/xstorage/xlliu
+          name: xlliu
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseekr10528-prefill-svc
+  namespace: t-hisys-xlliu
+spec:
+  selector:  # a pod must carry both labels; role: leader is set only in the leaderTemplate
+    leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill
+    role: leader
+  ports:
+    - protocol: TCP
+      port: 30000
+      targetPort: 30000  # containerPort of the sglang-leader container
diff --git a/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml b/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml
new file mode 100644
index 0000000..9b97c37
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml
@@ -0,0 +1,89 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: sglang-server
+  namespace: t-hisys-xlliu
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: sglang
+  template:
+    metadata:
+      labels:
+        app: sglang
+      annotations:
+        k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: unit
+                    operator: In
+                    values:
+                      - "2"
+      restartPolicy: Always
+      containers:
+        - name: sglang
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest
+          env:
+          - name: NCCL_IB_GID_INDEX
+            value: "5"
+          - name: NODE_NAME
+            valueFrom:
+              fieldRef:
+                fieldPath: spec.nodeName
+          command: ["python3"]
+          args:
+            - "-m"
+            - "sglang.launch_server"
+            - "--model"
+            - "/root/.cache/huggingface/deepseek-ai/DeepSeek-V3-0324"
+            - "--tp"
+            - "8"
+            - "--trust-remote-code"
+            - "--port"
+            - "30000"
+          ports:
+            - containerPort: 30000
+          # Device limits: 8 GPUs and one device from each of the eight RDMA resources.
+          resources:
+            limits:
+              nvidia.com/gpu: 8
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+            - name: huggingface-cache
+              mountPath: /root/.cache/huggingface
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: huggingface-cache
+          hostPath:
+            path: /mnt/xstorage/model
+        - name: shm
+          emptyDir:
+            medium: Memory
+            sizeLimit: 32Gi
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: sglang-service
+  namespace: t-hisys-xlliu
+spec:
+  type: NodePort
+  selector:
+    app: sglang
+  ports:
+    - port: 30000  # Service port (in-cluster)
+      targetPort: 30000  # containerPort of the sglang container
+      nodePort: 30000  # port opened on every node
diff --git a/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml b/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml
new file mode 100644
index 0000000..1ccddd6
--- /dev/null
+++ b/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml
@@ -0,0 +1,381 @@
+# 2p2d.yaml: vLLM prefill/decode disaggregation with NixlConnector (2 prefill + 2 decode Deployments and a proxy)
+
+---
+# Prefill A: vLLM instance marked as prefill (VLLM_IS_PREFILL=1), serving on port 8100 with tensor parallelism 8.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-prefill-a
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-prefill-a
+  template:
+    metadata:
+      labels:
+        app: vllm-prefill-a
+        scitix.ai/topo-aware-in-node: "true"
+      annotations:
+        sidecar.istio.io/inject: "false"
+        k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024]  # only these hosts are eligible
+      containers:
+        - name: prefill-a
+          image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm
+          command: ["/usr/bin/env", "bash", "-c"]
+          args:  # starts vllm serve in the background, then sleeps forever, so the container stays up even if vllm exits
+            - |
+              export VLLM_IS_PREFILL=1
+              export VLLM_NIXL_SIDE_CHANNEL_PORT=5557
+              export NCCL_DEBUG=INFO
+              CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+              vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \
+                --port 8100 \
+                --tensor-parallel-size 8 \
+                --enforce-eager \
+                --disable-log-requests \
+                --block-size 128 \
+                --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
+              sleep inf
+          resources:
+            limits:
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+            requests:  # identical to limits
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+            - name: everything
+              mountPath: /data
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: everything
+          hostPath:
+            path: /mnt/xstorage  # the model path /data/model/... resolves below this host directory
+        - name: shm
+          hostPath:
+            path: /dev/shm  # the node's own /dev/shm (hostPath), not a pod-private emptyDir
+
+---
+# Prefill B: vLLM instance marked as prefill (VLLM_IS_PREFILL=1), serving on port 8101 with tensor parallelism 8.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-prefill-b
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-prefill-b
+  template:
+    metadata:
+      labels:
+        app: vllm-prefill-b
+        scitix.ai/topo-aware-in-node: "true"
+      annotations:
+        sidecar.istio.io/inject: "false"
+        k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024]  # only these hosts are eligible
+      containers:
+        - name: prefill-b
+          image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm
+          command: ["/usr/bin/env", "bash", "-c"]
+          args:  # starts vllm serve in the background, then sleeps forever, so the container stays up even if vllm exits
+            - |
+              export VLLM_IS_PREFILL=1
+              export VLLM_NIXL_SIDE_CHANNEL_PORT=5558
+              CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+              vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \
+                --port 8101 \
+                --tensor-parallel-size 8 \
+                --enforce-eager \
+                --disable-log-requests \
+                --block-size 128 \
+                --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
+              sleep inf
+          resources:
+            limits:
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+            requests:  # identical to limits
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+            - name: everything
+              mountPath: /data
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: everything
+          hostPath:
+            path: /mnt/xstorage  # the model path /data/model/... resolves below this host directory
+        - name: shm
+          hostPath:
+            path: /dev/shm  # the node's own /dev/shm (hostPath), not a pod-private emptyDir
+
+---
+# Decode A: vLLM instance marked as non-prefill (VLLM_IS_PREFILL=0), serving on port 8200 with tensor parallelism 8.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-decode-a
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-decode-a
+  template:
+    metadata:
+      labels:
+        app: vllm-decode-a
+        scitix.ai/topo-aware-in-node: "true"
+      annotations:
+        sidecar.istio.io/inject: "false"
+        k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024]  # only these hosts are eligible
+      containers:
+        - name: decode-a
+          image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm
+          command: ["/usr/bin/env", "bash", "-c"]
+          args:  # starts vllm serve in the background, then sleeps forever, so the container stays up even if vllm exits
+            - |
+              export VLLM_IS_PREFILL=0
+              export VLLM_NIXL_SIDE_CHANNEL_PORT=5559
+              CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+              vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \
+                --port 8200 \
+                --tensor-parallel-size 8 \
+                --enforce-eager \
+                --disable-log-requests \
+                --block-size 128 \
+                --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
+              sleep inf
+          resources:
+            limits:
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+            requests:  # identical to limits
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+            - name: everything
+              mountPath: /data
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: everything
+          hostPath:
+            path: /mnt/xstorage  # the model path /data/model/... resolves below this host directory
+        - name: shm
+          hostPath:
+            path: /dev/shm  # the node's own /dev/shm (hostPath), not a pod-private emptyDir
+
+---
+# Decode B: vLLM instance marked as non-prefill (VLLM_IS_PREFILL=0), serving on port 8201 with tensor parallelism 8.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-decode-b
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-decode-b
+  template:
+    metadata:
+      labels:
+        app: vllm-decode-b
+        scitix.ai/topo-aware-in-node: "true"
+      annotations:
+        sidecar.istio.io/inject: "false"
+        k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7
+    spec:
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024]  # only these hosts are eligible
+      containers:
+        - name: decode-b
+          image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm
+          command: ["/usr/bin/env", "bash", "-c"]
+          args:  # starts vllm serve in the background, then sleeps forever, so the container stays up even if vllm exits
+            - |
+              export VLLM_IS_PREFILL=0
+              export VLLM_NIXL_SIDE_CHANNEL_PORT=5560
+              CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+              vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \
+                --port 8201 \
+                --tensor-parallel-size 8 \
+                --enforce-eager \
+                --disable-log-requests \
+                --block-size 128 \
+                --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' &
+              sleep inf
+          resources:
+            limits:
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+            requests:  # identical to limits
+              cpu: "80"
+              memory: 1000Gi
+              nvidia.com/gpu: "8"
+              nvidia.com/rdma0: "1"
+              nvidia.com/rdma1: "1"
+              nvidia.com/rdma2: "1"
+              nvidia.com/rdma3: "1"
+              nvidia.com/rdma4: "1"
+              nvidia.com/rdma5: "1"
+              nvidia.com/rdma6: "1"
+              nvidia.com/rdma7: "1"
+          volumeMounts:
+            - name: everything
+              mountPath: /data
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: everything
+          hostPath:
+            path: /mnt/xstorage  # the model path /data/model/... resolves below this host directory
+        - name: shm
+          hostPath:
+            path: /dev/shm  # the node's own /dev/shm (hostPath), not a pod-private emptyDir
+
+---
+# Proxy: runs toy_proxy_server.py on port 8192 in front of prefiller ports 8100/8101 and decoder ports 8200/8201.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: vllm-proxy
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: vllm-proxy
+  template:
+    metadata:
+      labels:
+        app: vllm-proxy
+      annotations:
+        sidecar.istio.io/inject: "false"
+    spec:
+      containers:
+        - name: proxy
+          image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm
+          command: ["/usr/bin/env", "bash", "-c"]
+          # NOTE(review): only ports are passed to the proxy and this file defines no Service for the
+          # prefill/decode pods - confirm how the proxy resolves their hosts from its own pod.
+          args:  # starts the proxy in the background, then sleeps forever, so the container stays up even if it exits
+            - |
+              python3 /data/vllm/toy_proxy_server.py \
+                --port 8192 \
+                --prefiller-port 8100 8101 \
+                --decoder-port 8200 8201 &
+              sleep inf
+          volumeMounts:
+            - name: everything
+              mountPath: /data
+            - name: shm
+              mountPath: /dev/shm
+      volumes:
+        - name: everything
+          hostPath:
+            path: /mnt/xstorage  # the proxy script /data/vllm/toy_proxy_server.py is read from this host directory
+        - name: shm
+          hostPath:
+            path: /dev/shm
diff --git a/thirdparty/sglang b/thirdparty/sglang
new file mode 160000
index 0000000..8604471
--- /dev/null
+++ b/thirdparty/sglang
@@ -0,0 +1 @@
+Subproject commit 86044712c6492df3ceb5a5cf025a575ab3989061
diff --git a/thirdparty/vllm b/thirdparty/vllm
new file mode 160000
index 0000000..8020e98
--- /dev/null
+++ b/thirdparty/vllm
@@ -0,0 +1 @@
+Subproject commit 8020e98c9f033e76c97eb8261f772d59eba49c9a

From 8542f4ab7a467a728295317515841b609ca0bb17 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Mon, 8 Sep 2025 10:33:40 +0800
Subject: [PATCH 02/12] Create 1p2d_d.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_d.yaml     | 197 ++++++++++++++++++
 1 file changed, 197 insertions(+)
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml
new file mode 100644
index 0000000..334d3a4
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml
@@ -0,0 +1,197 @@
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: alan-deepseek-decode
+  namespace: llm
+spec:
+  leaderWorkerTemplate:
+    size: 2
+    leaderTemplate:
+      metadata:
+        annotations:
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+        labels:
+          role: decode-leader
+      spec:
+        hostNetwork: true  # pod uses the node's network namespace
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet  # keeps cluster DNS resolution while on the host network
+        nodeSelector:
+          deepseek-pool: "true"
+        containers:
+        - name: sglang-leader
+          image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env","bash","-c"]
+          args:  # node count comes from LWS_GROUP_SIZE, as in the worker template; the leader is always rank 0
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+              --model-path /data/DeepSeek-V3-0324 \
+              --chunked-prefill-size "262144" --page-size "64" \
+              --enable-dp-attention --enable-dp-lm-head --dp-size "2" \
+              --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \
+              --mem-fraction-static "0.849" --context-length "32768" \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+              --cuda-graph-max-bs "64" --max-running-requests "2048" \
+              --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} --node-rank 0 \
+              --trust-remote-code \
+              --disaggregation-transfer-backend nixl
+          env:
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_IB_GID_INDEX
+            value: "3"  # NOTE(review): the roce/gid-injection annotation also targets this variable - confirm which value wins
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: CUDA_LAUNCH_BLOCKING
+            value: "0"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:  # TCP check on the server port: first probe after 5 minutes, then every 5 minutes
+            initialDelaySeconds: 300
+            timeoutSeconds: 300
+            periodSeconds: 300
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /data
+            name: model
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }
+        - name: model
+          hostPath: { path: /data }
+
+    workerTemplate:
+      metadata:
+        annotations:
+          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
+        labels:
+          role: decode-worker
+      spec:
+        hostNetwork: true  # pod uses the node's network namespace
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet  # keeps cluster DNS resolution while on the host network
+        nodeSelector:
+          deepseek-pool: "true"
+        containers:
+        - name: sglang-worker
+          image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env","bash","-c"]
+          args:  # node count and rank are read from LWS_GROUP_SIZE / LWS_WORKER_INDEX at start-up
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
+              --model-path /data/DeepSeek-V3-0324 \
+              --chunked-prefill-size "262144" --page-size "64" \
+              --enable-dp-attention --enable-dp-lm-head --dp-size "2" \
+              --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \
+              --mem-fraction-static "0.849" --context-length "32768" \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+              --cuda-graph-max-bs "64" --max-running-requests "2048" \
+              --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \
+              --trust-remote-code \
+              --disaggregation-transfer-backend nixl
+          env:
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NVSHMEM_IB_TRAFFIC_CLASS
+            value: "16"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NCCL_IB_GID_INDEX
+            value: "3"  # NOTE(review): the roce/gid-injection annotation also targets this variable - confirm which value wins
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: "none"
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGLANG_MOONCAKE_TRANS_THREAD
+            value: "16"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:  # TCP check on the server port: first probe after 5 minutes, then every 5 minutes
+            initialDelaySeconds: 300
+            timeoutSeconds: 300
+            periodSeconds: 300
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /data
+            name: model
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }
+        - name: model
+          hostPath: { path: /data }
+
+  replicas: 1
+  rolloutStrategy:
+    type: RollingUpdate
+    rollingUpdateConfiguration:
+      maxSurge: 0
+      maxUnavailable: 1
+  startupPolicy: LeaderCreated

From 6abd3e50c95fbca908cec066bf0beb1b173df644 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Mon, 8 Sep 2025 10:34:13 +0800
Subject: [PATCH 03/12] Create 1p2d_p.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_p.yaml     | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
new file mode 100644
index 0000000..2a61d75
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
@@ -0,0 +1,105 @@
+# Prefill tier of the SGLang prefill/decode-disaggregated DeepSeek deployment.
+# LeaderWorkerSet with size 1; the leader runs the prefill server on port 30000.
+# English gloss of the inline notes below:
+#   env: keep these network/communication variables; LWS_* is injected by the controller.
+#   workerTemplate: a single prefill node creates no worker; the section may be removed.
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: alan-deepseek-prefill
+  namespace: llm
+spec:
+  leaderWorkerTemplate:
+    size: 1
+    leaderTemplate:
+      metadata:
+        labels:
+          app: alan-deepseek
+          role: prefill-leader
+          leaderworkerset.sigs.k8s.io/role: leader
+      spec:
+        dnsPolicy: ClusterFirst
+        containers:
+        - name: sglang-leader
+          image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+          command: ["/usr/bin/env","bash","-c"]
+          args:
+          - |
+            set -euxo pipefail
+            export POD_IP="$(hostname -i)"
+            echo "PREFILL leader on ${POD_IP}:30000"
+            exec python3 -m sglang.launch_server \
+              --port 30000 \
+              --host "0.0.0.0" \
+              --model-path /data/DeepSeek-V3-0324 \
+              --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \
+              --chunked-prefill-size 524288 \
+              --max-prefill-tokens 32768 \
+              --page-size 64 \
+              --disable-radix-cache \
+              --enable-deepep-moe --deepep-mode normal \
+              --disaggregation-mode prefill \
+              --mem-fraction-static 0.85 --context-length 32768 \
+              --tp 8 \
+              --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \
+              --trust-remote-code --max-running-requests 1024 \
+              --disaggregation-transfer-backend nixl
+          env:
+          # 保留这些网络/通信相关 env；LWS_* 由控制器自动注入，无需自己用 fieldRef 再注入
+          - name: GLOO_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_SOCKET_IFNAME
+            value: eth0
+          - name: NCCL_IB_GID_INDEX
+            value: "3"
+          - name: NCCL_IB_QPS_PER_CONNECTION
+            value: "8"
+          - name: NCCL_IB_SPLIT_DATA_ON_QPS
+            value: "1"
+          - name: NCCL_NET_PLUGIN
+            value: none
+          - name: NCCL_MIN_NCHANNELS
+            value: "4"
+          - name: SGLANG_SET_CPU_AFFINITY
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+            value: "1"
+          - name: NVSHMEM_IB_GID_INDEX
+            value: "3"
+          - name: NVSHMEM_HCA_PE_MAPPING
+            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          ports:
+          - containerPort: 30000
+          - containerPort: 8998   # NIXL bootstrap
+          # Readiness is a plain TCP connect to the server port.
+          readinessProbe:
+            tcpSocket: { port: 30000 }
+            initialDelaySeconds: 10
+            periodSeconds: 5
+            timeoutSeconds: 3
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          volumeMounts:
+          - { name: dshm,  mountPath: /dev/shm }
+          - { name: model, mountPath: /data }
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }
+        - name: model
+          hostPath: { path: /data }
+    # 1P 不会创建 worker；如不需要可整段删除
+    workerTemplate:
+      metadata:
+        labels:
+          app: alan-deepseek
+          role: prefill-worker
+          leaderworkerset.sigs.k8s.io/role: worker
+      spec:
+        containers:
+        - name: noop
+          image: busybox
+          command: ["sh","-c","sleep 3600000"]

From 95bbbf8752d585cfa014a0f32233a46267606bd5 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Mon, 8 Sep 2025 10:34:52 +0800
Subject: [PATCH 04/12] Create 1p2d_lb.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_lb.yaml    | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml
new file mode 100644
index 0000000..f9e77bc
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml
@@ -0,0 +1,53 @@
+# SGLang mini load balancer Deployment plus the ClusterIP Service in front of it.
+# The balancer is pointed at the prefill and decode Services via --prefill/--decode.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alan-deepseek-lb
+  namespace: llm
+  labels: {app: alan-deepseek}
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: alan-deepseek, tier: lb}
+  template:
+    metadata:
+      labels: {app: alan-deepseek, tier: lb}
+    spec:
+      containers:
+      - name: sgl-minilb
+        image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+        command: ["python","-m","sglang.srt.disaggregation.mini_lb",
+                  "--prefill","http://alan-deepseek-prefill-svc:30000",
+                  "--decode","http://alan-deepseek-decode-svc:30000",
+                  "--host","0.0.0.0","--port","8000"]
+        ports:
+        - containerPort: 8000
+        readinessProbe:
+          tcpSocket: {port: 8000}
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+        volumeMounts:
+        - name: model
+          mountPath: /data
+      volumes:
+      - name: model
+        hostPath: {path: /data}
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alan-deepseek-svc
+  namespace: llm
+spec:
+  type: ClusterIP
+  selector:
+    app: alan-deepseek
+    tier: lb
+  ports:
+    - name: http
+      protocol: TCP
+      port: 8000
+      targetPort: 8000

From 1a4fb726ad394dab555f9c95cc9bcd62642cbd29 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Mon, 8 Sep 2025 10:35:26 +0800
Subject: [PATCH 05/12] Create svc.yaml

---
 .../k8s/inference/sglang/2p2d/svc.yaml        | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml
new file mode 100644
index 0000000..c7a4404
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml
@@ -0,0 +1,40 @@
+---
+# Service for the prefill leader pod: HTTP API (30000) and the port named
+# "nixl-boot" (8998). Selects on the LWS name label plus the role=leader label.
+apiVersion: v1
+kind: Service
+metadata:
+  name: alan-deepseek-prefill-svc
+  namespace: llm
+spec:
+  type: ClusterIP
+  selector:
+    leaderworkerset.sigs.k8s.io/name: alan-deepseek-prefill
+    leaderworkerset.sigs.k8s.io/role: leader
+  ports:
+  - name: http
+    protocol: TCP
+    port: 30000
+    targetPort: 30000
+  - name: nixl-boot
+    protocol: TCP
+    port: 8998
+    targetPort: 8998
+
+---
+# Service for the decode leader pod: HTTP API (30000) only.
+apiVersion: v1
+kind: Service
+metadata:
+  name: alan-deepseek-decode-svc
+  namespace: llm
+spec:
+  type: ClusterIP
+  selector:
+    leaderworkerset.sigs.k8s.io/name: alan-deepseek-decode
+    leaderworkerset.sigs.k8s.io/role: leader
+  ports:
+  - name: http
+    protocol: TCP
+    port: 30000
+    targetPort: 30000

From c4fe1d36ab043df2cc64d75f614fc962b4103f2c Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Wed, 10 Sep 2025 11:17:41 +0800
Subject: [PATCH 06/12] Update 1p2d_p.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_p.yaml     | 27 ++++++++-----------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
index 2a61d75..cdfc793 100644
--- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
@@ -2,7 +2,7 @@ apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
   name: alan-deepseek-prefill
-  namespace: llm
+  namespace: t-ai-infra-qqxu03
 spec:
   leaderWorkerTemplate:
     size: 1
@@ -13,23 +13,23 @@ spec:
           role: prefill-leader
           leaderworkerset.sigs.k8s.io/role: leader
       spec:
+        nodeSelector:
+          "siflow.scitix.ai/resource-pool-name": hisys-inference
         dnsPolicy: ClusterFirst
         containers:
         - name: sglang-leader
-          image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+          image: lmsysorg/sglang:v0.5.1.post2-cu126
           command: ["/usr/bin/env","bash","-c"]
           args:
           - |
             set -euxo pipefail
             export POD_IP="$(hostname -i)"
-            echo "PREFILL leader on ${POD_IP}:30000"
             exec python3 -m sglang.launch_server \
               --port 30000 \
               --host "0.0.0.0" \
-              --model-path /data/DeepSeek-V3-0324 \
-              --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \
+              --model-path /data/DeepSeek-R1 \
+              --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 \
               --chunked-prefill-size 524288 \
-              --max-prefill-tokens 32768 \
               --page-size 64 \
               --disable-radix-cache \
               --enable-deepep-moe --deepep-mode normal \
@@ -41,7 +41,7 @@ spec:
               --trust-remote-code --max-running-requests 1024 \
               --disaggregation-transfer-backend nixl
           env:
-          # 保留这些网络/通信相关 env；LWS_* 由控制器自动注入，无需自己用 fieldRef 再注入
+          # 这些参数可以花时间看看哪些是可以去掉的，一直没太搞明白cc瑞哥
           - name: GLOO_SOCKET_IFNAME
             value: eth0
           - name: NCCL_SOCKET_IFNAME
@@ -50,8 +50,6 @@ spec:
             value: "3"
           - name: NCCL_IB_QPS_PER_CONNECTION
             value: "8"
-          - name: NCCL_IB_SPLIT_DATA_ON_QPS
-            value: "1"
           - name: NCCL_NET_PLUGIN
             value: none
           - name: NCCL_MIN_NCHANNELS
@@ -60,12 +58,6 @@ spec:
             value: "true"
           - name: SGL_ENABLE_JIT_DEEPGEMM
             value: "1"
-          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
-            value: "1"
-          - name: NVSHMEM_IB_GID_INDEX
-            value: "3"
-          - name: NVSHMEM_HCA_PE_MAPPING
-            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
           ports:
           - containerPort: 30000
           - containerPort: 8998   # NIXL bootstrap
@@ -84,7 +76,8 @@ spec:
         - name: dshm
           emptyDir: { medium: Memory }
         - name: model
-          hostPath: { path: /data }
+          persistentVolumeClaim:
+            claimName: siflow-models
     # 1P 不会创建 worker；如不需要可整段删除
     workerTemplate:
       metadata:
@@ -93,6 +86,8 @@ spec:
           role: prefill-worker
           leaderworkerset.sigs.k8s.io/role: worker
       spec:
+        nodeSelector:
+          "siflow.scitix.ai/resource-pool-name": hisys-inference
         containers:
         - name: noop
           image: busybox

From 54c89eafad6fa89b8dee3d975c30c2effc9fafd8 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Wed, 10 Sep 2025 11:18:08 +0800
Subject: [PATCH 07/12] Update 1p2d_p.yaml

---
 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
index cdfc793..bcbc1fb 100644
--- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
@@ -78,7 +78,7 @@ spec:
         - name: model
           persistentVolumeClaim:
             claimName: siflow-models
-    # 1P 不会创建 worker；如不需要可整段删除
+    # 1P 不会创建 worker；如不需要可整段删除，如果多个p需要，因此保留
     workerTemplate:
       metadata:
         labels:

From f9fb6ef4f0ce8afde830f4512d457c560bfca8df Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Wed, 10 Sep 2025 11:18:42 +0800
Subject: [PATCH 08/12] Update 1p2d_d.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_d.yaml     | 126 ++++++++----------
 1 file changed, 58 insertions(+), 68 deletions(-)

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml
index 334d3a4..858a3a4 100644
--- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml
@@ -2,25 +2,24 @@ apiVersion: leaderworkerset.x-k8s.io/v1
 kind: LeaderWorkerSet
 metadata:
   name: alan-deepseek-decode
-  namespace: llm
+  namespace: t-ai-infra-qqxu03
 spec:
   leaderWorkerTemplate:
     size: 2
     leaderTemplate:
       metadata:
-        annotations:
-          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
         labels:
+          leaderworkerset.sigs.k8s.io/role: leader
           role: decode-leader
       spec:
         hostNetwork: true
         hostIPC: true
         dnsPolicy: ClusterFirstWithHostNet
         nodeSelector:
-          deepseek-pool: "true"
+          "siflow.scitix.ai/resource-pool-name": hisys-inference
         containers:
         - name: sglang-leader
-          image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+          image: lmsysorg/sglang:v0.5.1.post2-cu126
           securityContext:
             privileged: true
           command: ["/usr/bin/env","bash","-c"]
@@ -30,44 +29,40 @@ spec:
             echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
             echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
             exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
-              --model-path /data/DeepSeek-V3-0324 \
+              --model-path /data/DeepSeek-R1 \
               --chunked-prefill-size "262144" --page-size "64" \
-              --enable-dp-attention --enable-dp-lm-head --dp-size "2" \
+              --enable-dp-attention --enable-dp-lm-head --dp-size "16" \
               --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \
               --mem-fraction-static "0.849" --context-length "32768" \
-              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
               --cuda-graph-max-bs "64" --max-running-requests "2048" \
-              --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
-              --nnodes 2 --node-rank 0 \
-              --trust-remote-code  \
-              --disaggregation-transfer-backend nixl
+              --eplb-rebalance-layers-per-chunk  "29" \
+              --tp-size "16" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes 2 --node-rank 0 --moe-dense-tp-size "1" \
+              --trust-remote-code  --disaggregation-transfer-backend nixl
           env:
           - name: SGLANG_HOST_IP
             valueFrom: {fieldRef: {fieldPath: status.hostIP}}
           - name: HOST_IP
             valueFrom: {fieldRef: {fieldPath: status.hostIP}}
-          - name: GLOO_SOCKET_IFNAME
-            value: eth0
-          - name: NCCL_SOCKET_IFNAME
-            value: eth0
-          - name: NCCL_IB_GID_INDEX
-            value: "3"
-          - name: NVSHMEM_IB_GID_INDEX
-            value: "3"
-          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
-            value: "1"
-          - name: NVSHMEM_HCA_PE_MAPPING
-            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
-          - name: NCCL_IB_QPS_PER_CONNECTION
-            value: "8"
-          - name: NCCL_IB_SPLIT_DATA_ON_QPS
-            value: "1"
-          - name: CUDA_LAUNCH_BLOCKING
-            value: "0"
-          - name: NCCL_NET_PLUGIN
-            value: "none"
-          - name: NCCL_MIN_NCHANNELS
-            value: "4"
+          # - name: NCCL_IB_GID_INDEX
+          #   value: "3"
+          # - name: NVSHMEM_IB_GID_INDEX
+          #   value: "3"
+          # - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+          #   value: "1"
+          # - name: NVSHMEM_HCA_PE_MAPPING
+          #   value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          # - name: NCCL_IB_QPS_PER_CONNECTION
+          #   value: "8"
+          # - name: NCCL_IB_SPLIT_DATA_ON_QPS
+          #   value: "1"
+          # - name: CUDA_LAUNCH_BLOCKING
+          #   value: "0"
+          # - name: NCCL_NET_PLUGIN
+          #   value: "none"
+          # - name: NCCL_MIN_NCHANNELS
+          #   value: "4"
           - name: MC_TE_METRIC
             value: "true"
           - name: SGLANG_MOONCAKE_TRANS_THREAD
@@ -95,12 +90,11 @@ spec:
         - name: dshm
           emptyDir: { medium: Memory }
         - name: model
-          hostPath: { path: /data }
+          persistentVolumeClaim:
+            claimName: siflow-models
 
     workerTemplate:
       metadata:
-        annotations:
-          roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}'
         labels:
           role: decode-worker
       spec:
@@ -108,10 +102,10 @@ spec:
         hostIPC: true
         dnsPolicy: ClusterFirstWithHostNet
         nodeSelector:
-          deepseek-pool: "true"
+          "siflow.scitix.ai/resource-pool-name": hisys-inference
         containers:
         - name: sglang-worker
-          image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+          image: lmsysorg/sglang:v0.5.1.post2-cu126
           securityContext:
             privileged: true
           command: ["/usr/bin/env","bash","-c"]
@@ -121,44 +115,39 @@ spec:
             echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
             echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
             exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \
-              --model-path /data/DeepSeek-V3-0324 \
+              --model-path /data/DeepSeek-R1 \
               --chunked-prefill-size "262144" --page-size "64" \
-              --enable-dp-attention --enable-dp-lm-head --dp-size "2" \
+              --enable-dp-attention --enable-dp-lm-head --dp-size "16" \
               --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \
               --mem-fraction-static "0.849" --context-length "32768" \
-              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
               --cuda-graph-max-bs "64" --max-running-requests "2048" \
-              --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --tp-size "16" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
               --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \
-              --trust-remote-code \
-              --disaggregation-transfer-backend nixl
+              --trust-remote-code --moe-dense-tp-size "1" --disaggregation-transfer-backend nixl
           env:
           - name: SGLANG_HOST_IP
             valueFrom: {fieldRef: {fieldPath: status.hostIP}}
           - name: HOST_IP
             valueFrom: {fieldRef: {fieldPath: status.hostIP}}
-          - name: GLOO_SOCKET_IFNAME
-            value: eth0
-          - name: NCCL_SOCKET_IFNAME
-            value: eth0
-          - name: NVSHMEM_IB_TRAFFIC_CLASS
-            value: "16"
-          - name: NVSHMEM_IB_GID_INDEX
-            value: "3"
-          - name: NCCL_IB_GID_INDEX
-            value: "3"
-          - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
-            value: "1"
-          - name: NVSHMEM_HCA_PE_MAPPING
-            value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
-          - name: NCCL_IB_QPS_PER_CONNECTION
-            value: "8"
-          - name: NCCL_IB_SPLIT_DATA_ON_QPS
-            value: "1"
-          - name: NCCL_NET_PLUGIN
-            value: "none"
-          - name: NCCL_MIN_NCHANNELS
-            value: "4"
+          # - name: NVSHMEM_IB_TRAFFIC_CLASS
+          #   value: "16"
+          # - name: NVSHMEM_IB_GID_INDEX
+          #   value: "3"
+          # - name: NCCL_IB_GID_INDEX
+          #   value: "3"
+          # - name: NVSHMEM_ENABLE_NIC_PE_MAPPING
+          #   value: "1"
+          # - name: NVSHMEM_HCA_PE_MAPPING
+          #   value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2"
+          # - name: NCCL_IB_QPS_PER_CONNECTION
+          #   value: "8"
+          # - name: NCCL_IB_SPLIT_DATA_ON_QPS
+          #   value: "1"
+          # - name: NCCL_NET_PLUGIN
+          #   value: "none"
+          # - name: NCCL_MIN_NCHANNELS
+          #   value: "4"
           - name: MC_TE_METRIC
             value: "true"
           - name: SGLANG_MOONCAKE_TRANS_THREAD
@@ -186,7 +175,8 @@ spec:
         - name: dshm
           emptyDir: { medium: Memory }
         - name: model
-          hostPath: { path: /data }
+          persistentVolumeClaim:
+            claimName: siflow-models
 
   replicas: 1
   rolloutStrategy:

From 2a107e6c6b8116d92180f6813c1cfa7531e10184 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Wed, 10 Sep 2025 11:19:14 +0800
Subject: [PATCH 09/12] Update 1p2d_lb.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_lb.yaml           | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml
index f9e77bc..9614da9 100644
--- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml
@@ -2,7 +2,7 @@ apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: alan-deepseek-lb
-  namespace: llm
+  namespace: t-ai-infra-qqxu03
   labels: {app: alan-deepseek}
 spec:
   replicas: 1
@@ -12,11 +12,14 @@ spec:
     metadata:
       labels: {app: alan-deepseek, tier: lb}
     spec:
+      nodeSelector:
+        "siflow.scitix.ai/resource-pool-name": hisys-inference
       containers:
       - name: sgl-minilb
-        image: sealos.hub:5000/sglang:v0.5.1.post1-cu126
+        image: lmsysorg/sglang:v0.5.1.post2-cu126
         command: ["python","-m","sglang.srt.disaggregation.mini_lb",
                   "--prefill","http://alan-deepseek-prefill-svc:30000",
+                  "--prefill-bootstrap-ports","8998",
                   "--decode","http://alan-deepseek-decode-svc:30000",
                   "--host","0.0.0.0","--port","8000"]
         ports:
@@ -31,14 +34,15 @@ spec:
           mountPath: /data
       volumes:
       - name: model
-        hostPath: {path: /data}
+        persistentVolumeClaim:
+          claimName: siflow-models
 
 ---
 apiVersion: v1
 kind: Service
 metadata:
   name: alan-deepseek-svc
-  namespace: llm
+  namespace: t-ai-infra-qqxu03
 spec:
   type: ClusterIP
   selector:

From 210bd008426310fa7829372cb3c21487d736f231 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Wed, 10 Sep 2025 11:19:45 +0800
Subject: [PATCH 10/12] Update svc.yaml

---
 launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml
index c7a4404..0750ccd 100644
--- a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml
@@ -3,7 +3,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: alan-deepseek-prefill-svc
-  namespace: llm
+  namespace: t-ai-infra-qqxu03
 spec:
   type: ClusterIP
   selector:
@@ -24,7 +24,7 @@ apiVersion: v1
 kind: Service
 metadata:
   name: alan-deepseek-decode-svc
-  namespace: llm
+  namespace: t-ai-infra-qqxu03
 spec:
   type: ClusterIP
   selector:

From 30c51f0874dbb78dc8329d516a4538149df8de96 Mon Sep 17 00:00:00 2001
From: ziang663 <119752791+ziang663@users.noreply.github.com>
Date: Wed, 10 Sep 2025 14:40:56 +0800
Subject: [PATCH 11/12] Update 1p2d_p.yaml

---
 .../k8s/inference/sglang/2p2d/1p2d_p.yaml         | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
index bcbc1fb..18b2831 100644
--- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
+++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml
@@ -13,9 +13,11 @@ spec:
           role: prefill-leader
           leaderworkerset.sigs.k8s.io/role: leader
       spec:
+        hostNetwork: true
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet
         nodeSelector:
           "siflow.scitix.ai/resource-pool-name": hisys-inference
-        dnsPolicy: ClusterFirst
         containers:
         - name: sglang-leader
           image: lmsysorg/sglang:v0.5.1.post2-cu126
@@ -41,11 +43,14 @@ spec:
               --trust-remote-code --max-running-requests 1024 \
               --disaggregation-transfer-backend nixl
           env:
-          # 这些参数可以花时间看看哪些是可以去掉的，一直没太搞明白cc瑞哥
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
           - name: GLOO_SOCKET_IFNAME
-            value: eth0
+            value: bond0
           - name: NCCL_SOCKET_IFNAME
-            value: eth0
+            value: bond0
           - name: NCCL_IB_GID_INDEX
             value: "3"
           - name: NCCL_IB_QPS_PER_CONNECTION
@@ -78,7 +83,6 @@ spec:
         - name: model
           persistentVolumeClaim:
             claimName: siflow-models
-    # 1P 不会创建 worker；如不需要可整段删除，如果多个p需要，因此保留
     workerTemplate:
       metadata:
         labels:
@@ -92,3 +96,4 @@ spec:
         - name: noop
           image: busybox
           command: ["sh","-c","sleep 3600000"]
+

From f1c189b5c5628cb5048f03a8ba7eb0a680346185 Mon Sep 17 00:00:00 2001
From: jrshe <jrshe02@siflow.cn>
Date: Mon, 15 Sep 2025 07:43:04 +0000
Subject: [PATCH 12/12] Add 1p1d yamls

---
 .../k8s/inference/sglang/1p1d/d.yaml          | 176 ++++++++++++++++++
 .../k8s/inference/sglang/1p1d/lb.yaml         |  61 ++++++
 .../k8s/inference/sglang/1p1d/p.yaml          |  85 +++++++++
 .../k8s/inference/sglang/1p1d/svc.yaml        |  35 ++++
 4 files changed, 357 insertions(+)
 create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml

diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
new file mode 100644
index 0000000..a2d23b0
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
@@ -0,0 +1,182 @@
+# Decode tier of the SGLang prefill/decode-disaggregated DeepSeek deployment (1P1D).
+# One LeaderWorkerSet group of `size` pods, each limited to 8 GPUs, runs launch_server
+# with --tp-size 16; leader and worker are started with the same flag list.
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseek-decode
+  namespace: default
+spec:
+  leaderWorkerTemplate:
+    size: 2
+    leaderTemplate:
+      metadata:
+        labels:
+          leaderworkerset.sigs.k8s.io/role: leader
+          role: decode-leader
+      spec:
+        hostNetwork: true
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet
+        nodeSelector:
+          "scitix.ai/gpu-type": h20xnvlink141
+          "roce.scitix.ai/unit": unit1
+        containers:
+        - name: sglang-leader
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env","bash","-c"]
+          args:
+          # nnodes/node-rank are taken from the LWS-injected variables so that
+          # `size` above stays the single source of truth for the group topology.
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            exec python3 -m sglang.launch_server \
+              --host "0.0.0.0" \
+              --port 30000 \
+              --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+              --chunked-prefill-size 20480 \
+              --page-size 64 \
+              --enable-deepep-moe \
+              --deepep-mode low_latency \
+              --disaggregation-mode decode \
+              --mem-fraction-static 0.85 \
+              --context-length 32768 \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+              --cuda-graph-max-bs 64 \
+              --max-running-requests 2048 \
+              --eplb-rebalance-layers-per-chunk 29 \
+              --tp-size 16 \
+              --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} \
+              --node-rank ${LWS_WORKER_INDEX} \
+              --moe-dense-tp-size 1 \
+              --trust-remote-code \
+              --disaggregation-transfer-backend nixl \
+              --enable-dp-attention \
+              --enable-dp-lm-head \
+              --dp-size 8
+          env:
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            initialDelaySeconds: 100
+            timeoutSeconds: 300
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt
+            name: mnt
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage
+
+    workerTemplate:
+      metadata:
+        labels:
+          role: decode-worker
+      spec:
+        hostNetwork: true
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet
+        nodeSelector:
+          "scitix.ai/gpu-type": h20xnvlink141
+          "roce.scitix.ai/unit": unit1
+        containers:
+        - name: sglang-worker
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env","bash","-c"]
+          args:
+          # Flag list mirrors the leader's; keep both in sync when tuning.
+          - |-
+            echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+            echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+            echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+            exec python3 -m sglang.launch_server \
+              --host "0.0.0.0" \
+              --port 30000 \
+              --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+              --chunked-prefill-size 20480 \
+              --page-size 64 \
+              --enable-deepep-moe \
+              --deepep-mode low_latency \
+              --disaggregation-mode decode \
+              --mem-fraction-static 0.85 \
+              --context-length 32768 \
+              --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+              --cuda-graph-max-bs 64 \
+              --max-running-requests 2048 \
+              --eplb-rebalance-layers-per-chunk 29 \
+              --tp-size 16 \
+              --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+              --nnodes ${LWS_GROUP_SIZE} \
+              --node-rank ${LWS_WORKER_INDEX} \
+              --moe-dense-tp-size 1 \
+              --trust-remote-code \
+              --disaggregation-transfer-backend nixl \
+              --enable-dp-attention \
+              --enable-dp-lm-head \
+              --dp-size 8
+          env:
+          - name: SGLANG_HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: HOST_IP
+            valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+          - name: MC_TE_METRIC
+            value: "true"
+          - name: SGL_ENABLE_JIT_DEEPGEMM
+            value: "1"
+          ports:
+          - containerPort: 30000
+            protocol: TCP
+          readinessProbe:
+            initialDelaySeconds: 100
+            timeoutSeconds: 300
+            periodSeconds: 30
+            tcpSocket:
+              port: 30000
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          volumeMounts:
+          - mountPath: /dev/shm
+            name: dshm
+          - mountPath: /mnt
+            name: mnt
+        volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage
+
+  replicas: 1
+  rolloutStrategy:
+    type: RollingUpdate
+    rollingUpdateConfiguration:
+      maxSurge: 0
+      maxUnavailable: 1
+  startupPolicy: LeaderCreated
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
new file mode 100644
index 0000000..5cc16ce
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
@@ -0,0 +1,61 @@
+apiVersion: apps/v1
+kind: Deployment  # Runs sglang's mini_lb in front of the prefill and decode services.
+metadata:
+  name: deepseek-lb
+  namespace: default
+  labels: {app: deepseek, tier: lb}
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: deepseek, tier: lb}  # Must equal the pod template labels below.
+  template:
+    metadata:
+      labels: {app: deepseek, tier: lb}  # Same label pair the deepseek-svc Service selects on.
+    spec:
+      hostNetwork: true  # Pod uses the node's network namespace, so port 8000 is bound on the host.
+      hostIPC: true
+      dnsPolicy: ClusterFirstWithHostNet  # Keeps cluster DNS in use under hostNetwork so the *-svc names resolve.
+      nodeSelector:
+        "roce.scitix.ai/unit": unit1  # Only nodes carrying this label are eligible.
+      containers:
+      - name: sglang-minilb
+        image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+        command: ["/usr/bin/env", "bash", "-c"]  # The script in args is handed to bash -c.
+        args:
+          - |
+            exec python -m sglang.srt.disaggregation.mini_lb \
+              --prefill "http://deepseek-prefill-svc:30000" \
+              --decode "http://deepseek-decode-svc:30000" \
+              --host "0.0.0.0" \
+              --port 8000
+        ports:
+        - containerPort: 8000  # Same port mini_lb is told to listen on via --port.
+          name: http
+        readinessProbe:
+          tcpSocket: {port: 8000}  # Ready as soon as a TCP connect to 8000 succeeds.
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+        volumeMounts:
+        - name: mnt
+          mountPath: /mnt
+      volumes:
+      - name: mnt
+        hostPath:
+          path: /mnt/xstorage  # Node directory exposed inside the container at /mnt.
+---
+apiVersion: v1
+kind: Service  # Cluster-internal entry point for the load-balancer pods.
+metadata:
+  name: deepseek-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector:  # Targets pods labelled app=deepseek and tier=lb.
+    app: deepseek
+    tier: lb
+  ports:
+    - name: http
+      protocol: TCP
+      port: 8000
+      targetPort: 8000  # Port the mini_lb process is launched with (--port 8000).
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
new file mode 100644
index 0000000..4d47e37
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
@@ -0,0 +1,85 @@
+apiVersion: apps/v1
+kind: Deployment  # Single-replica sglang server started with --disaggregation-mode prefill.
+metadata:
+  name: deepseek-prefill
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseek-prefill  # Must equal the pod template label below.
+  template:
+    metadata:
+      labels:
+        app: deepseek-prefill  # Label the deepseek-prefill-svc Service selects on.
+    spec:
+      hostNetwork: true  # Pod uses the node's network namespace; ports 30000/8998 bind on the host.
+      hostIPC: true
+      dnsPolicy: ClusterFirstWithHostNet  # Keeps cluster DNS in use under hostNetwork.
+      nodeSelector:  # Both labels must be present on the node.
+        "scitix.ai/gpu-type": h20xnvlink141
+        "roce.scitix.ai/unit": unit1
+      containers:
+        - name: sglang-prefill
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true  # NOTE(review): presumably required for RDMA device access - confirm.
+          command: ["/usr/bin/env", "bash", "-c"]  # The script in args is handed to bash -c.
+          args:
+            - |
+              set -euxo pipefail
+              export POD_IP="$(hostname -i)"
+              exec python3 -m sglang.launch_server \
+                --host "0.0.0.0" \
+                --port 30000 \
+                --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+                --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+                --chunked-prefill-size 20480 \
+                --page-size 64 \
+                --disable-radix-cache \
+                --enable-deepep-moe \
+                --deepep-mode normal \
+                --disaggregation-mode prefill \
+                --mem-fraction-static 0.85 \
+                --context-length 32768 \
+                --tp-size 8 \
+                --trust-remote-code \
+                --max-running-requests 1024 \
+                --disaggregation-transfer-backend nixl
+          env:
+            - name: GLOO_SOCKET_IFNAME  # Gloo and NCCL sockets are both pinned to bond0.
+              value: bond0
+            - name: NCCL_SOCKET_IFNAME
+              value: bond0
+            - name: NCCL_IB_GID_INDEX
+              value: "3"  # Quoted: env values must be strings.
+            - name: NCCL_IB_QPS_PER_CONNECTION
+              value: "8"
+            - name: NCCL_NET_PLUGIN
+              value: none
+            - name: NCCL_MIN_NCHANNELS
+              value: "4"
+            - name: SGLANG_SET_CPU_AFFINITY
+              value: "true"  # Quoted so it stays a string rather than a YAML boolean.
+            - name: SGL_ENABLE_JIT_DEEPGEMM
+              value: "1"
+          ports:
+            - containerPort: 30000  # HTTP port passed to launch_server via --port.
+            - containerPort: 8998  # Published by deepseek-prefill-svc under the name nixl-boot.
+          readinessProbe:
+            tcpSocket: { port: 30000 }  # Ready as soon as a TCP connect to 30000 succeeds.
+            initialDelaySeconds: 10
+            periodSeconds: 5
+            timeoutSeconds: 3
+          resources:
+            limits:
+              nvidia.com/gpu: "8"  # Same count as --tp-size 8 in the launch command.
+          volumeMounts:
+            - { name: dshm, mountPath: /dev/shm }
+            - { name: mnt, mountPath: /mnt }  # --model-path above is read from under /mnt.
+      volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }  # Memory-backed volume mounted at /dev/shm.
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage  # Node directory exposed inside the container at /mnt.
\ No newline at end of file
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml
new file mode 100644
index 0000000..e1db8e3
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml
@@ -0,0 +1,35 @@
+apiVersion: v1
+kind: Service  # Cluster-internal endpoint for the prefill pods.
+metadata:
+  name: deepseek-prefill-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector:
+    app: deepseek-prefill  # Label carried by the deepseek-prefill Deployment's pods.
+  ports:
+  - name: http  # sglang HTTP port (launch_server --port 30000).
+    protocol: TCP
+    port: 30000
+    targetPort: 30000
+  - name: nixl-boot  # Second container port declared by the sglang-prefill container.
+    protocol: TCP
+    port: 8998
+    targetPort: 8998
+
+---
+apiVersion: v1
+kind: Service  # Cluster-internal endpoint for decode; mini_lb reaches it as deepseek-decode-svc:30000.
+metadata:
+  name: deepseek-decode-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector:  # A pod must carry both labels to receive traffic.
+    leaderworkerset.sigs.k8s.io/name: deepseek-decode
+    leaderworkerset.sigs.k8s.io/role: leader  # NOTE(review): confirm the decode leader pods actually carry this label.
+  ports:
+  - name: http
+    protocol: TCP
+    port: 30000
+    targetPort: 30000
\ No newline at end of file