From d501a3f90587d30bea65f4c0ab19da419ca8fc24 Mon Sep 17 00:00:00 2001 From: hhu-scitix Date: Fri, 11 Jul 2025 08:32:44 +0000 Subject: [PATCH 01/12] add pd disaggregation deploy in vllm and sglang --- .gitmodules | 6 + .../k8s/inference/sglang/2p2d/d.yaml | 212 ++++++++++ .../k8s/inference/sglang/2p2d/lb.yaml | 60 +++ .../k8s/inference/sglang/2p2d/p.yaml | 211 ++++++++++ .../k8s/inference/sglang/4p9d/d.yaml | 246 +++++++++++ .../k8s/inference/sglang/4p9d/lb.yaml | 68 ++++ .../k8s/inference/sglang/4p9d/p.yaml | 251 ++++++++++++ .../inference/sglang/one-engine/server.yaml | 89 ++++ .../k8s/inference/vllm/nixl/2p2d.yaml | 381 ++++++++++++++++++ thirdparty/sglang | 1 + thirdparty/vllm | 1 + 11 files changed, 1526 insertions(+) create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/d.yaml create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/p.yaml create mode 100644 launcher_scripts/k8s/inference/sglang/4p9d/d.yaml create mode 100644 launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml create mode 100644 launcher_scripts/k8s/inference/sglang/4p9d/p.yaml create mode 100644 launcher_scripts/k8s/inference/sglang/one-engine/server.yaml create mode 100644 launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml create mode 160000 thirdparty/sglang create mode 160000 thirdparty/vllm diff --git a/.gitmodules b/.gitmodules index 475fe99..e1dbd83 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,9 @@ [submodule "thirdparty/Megatron-LM"] path = thirdparty/Megatron-LM url = https://github.com/NVIDIA/Megatron-LM.git +[submodule "thirdparty/vllm"] + path = thirdparty/vllm + url = https://github.com/vllm-project/vllm.git +[submodule "thirdparty/sglang"] + path = thirdparty/sglang + url = https://github.com/sgl-project/sglang.git diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml new file mode 100644 index 0000000..8223063 --- /dev/null +++ 
b/launcher_scripts/k8s/inference/sglang/2p2d/d.yaml @@ -0,0 +1,212 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-2decode + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --chunked-prefill-size "262144" --page-size "64" --enable-dp-attention --enable-dp-lm-head \ + --dp-size "16" --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" + env: + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: 
SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --chunked-prefill-size "262144" --page-size "64" --enable-dp-attention --enable-dp-lm-head \ + --dp-size "16" --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "16" --dist-init-addr 
$(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" + env: + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + # - name: NCCL_IB_SL + # value: "5" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + networkConfig: + subdomainPolicy: Shared + replicas: 1 + rolloutStrategy: + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + type: RollingUpdate + startupPolicy: LeaderCreated +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-2decode-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-2decode + role: leader + 
ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml new file mode 100644 index 0000000..bf5d960 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/lb.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseekr10528-2p2d-lb + namespace: t-hisys-xlliu + labels: + app: deepseekr10528-2p2d-lb +spec: + replicas: 1 + selector: + matchLabels: + app: deepseekr10528-2p2d-lb + template: + metadata: + labels: + app: deepseekr10528-2p2d-lb + spec: + containers: + - name: sgl-minilb + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + command: + - python + - -m + - sglang.srt.disaggregation.mini_lb + - --prefill + - http://deepseekr10528-2prefill-svc:30000 + - --decode + - http://deepseekr10528-2decode-svc:30000 + - --host + - 0.0.0.0 + - --port + - "8000" + ports: + - containerPort: 8000 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-2p2d-lb-svc + namespace: t-hisys-xlliu +spec: + type: ClusterIP # in-cluster access only; use NodePort instead for quick external testing + selector: + app: deepseekr10528-2p2d-lb + ports: + - protocol: TCP + port: 8000 # Service port (in-cluster) + targetPort: 8000 # containerPort of the sgl-minilb container diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml new file mode 100644 index 0000000..d479bd3 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/p.yaml @@ -0,0 +1,211 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-2prefill + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + 
k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --chunked-prefill-size "524288" --max-prefill-tokens "32768" \ + --page-size "64" --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek \ + --enable-dp-lm-head --enable-dp-attention --dp-size "16" --disable-radix-cache \ + --enable-deepep-moe --deepep-mode normal --disaggregation-mode prefill \ + --mem-fraction-static "0.7" --context-length "32768" \ + --tp "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" --max-running-requests "1024" + env: + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "false" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: 
sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + restartPolicy: RecreateGroupOnPodRestart + size: 2 + workerTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --chunked-prefill-size "524288" --max-prefill-tokens "32768" \ + --page-size "64" --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek \ + --enable-dp-lm-head --enable-dp-attention --dp-size "16" --disable-radix-cache \ + --enable-deepep-moe --deepep-mode normal --disaggregation-mode prefill \ + --mem-fraction-static "0.7" --context-length "32768" \ + --tp "16" --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --trust-remote-code --ep-num-redundant-experts "32" --moe-dense-tp-size "1" --max-running-requests "1024" + env: + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + 
- name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "8" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD + value: "0" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-2prefill-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-2prefill + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml new file mode 100644 index 0000000..684e202 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/4p9d/d.yaml @@ -0,0 +1,246 @@ 
+apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-decode + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + SGLANG_NUM_RESERVED_DECODE_TOKENS=102 MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 \ + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --disaggregation-mode decode --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $LWS_GROUP_SIZE --node-rank $LWS_WORKER_INDEX --tp-size 72 --dp-size 72 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode low_latency \ + --mem-fraction-static 0.835 --max-running-requests 18432 --context-length 4500 \ + --ep-num-redundant-experts 32 --cuda-graph-bs 256 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/decode_${LWS_WORKER_INDEX}.log + # --init-expert-location YOUR_PATH --ep-num-redundant-experts 32 --cuda-graph-bs 256" + env: + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: 
"mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + # - name: NCCL_IB_SL + # value: "5" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu + restartPolicy: RecreateGroupOnPodRestart + size: 9 + workerTemplate: + metadata: + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo 
"LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + SGLANG_NUM_RESERVED_DECODE_TOKENS=102 MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 \ + python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --disaggregation-mode decode --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $LWS_GROUP_SIZE --node-rank $LWS_WORKER_INDEX --tp-size 72 --dp-size 72 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode low_latency \ + --mem-fraction-static 0.835 --max-running-requests 18432 --context-length 4500 \ + --ep-num-redundant-experts 32 --cuda-graph-bs 256 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/decode_${LWS_WORKER_INDEX}.log + # --init-expert-location YOUR_PATH --ep-num-redundant-experts 32 --cuda-graph-bs 256" + env: + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + # - name: NCCL_IB_SL + # value: "5" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: 
metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30001 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu + networkConfig: + subdomainPolicy: Shared + replicas: 1 + rolloutStrategy: + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + type: RollingUpdate + startupPolicy: LeaderCreated +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-decode-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-decode + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml new file mode 100644 index 0000000..2d3267a --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/4p9d/lb.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: deepseekr10528-lb-main + namespace: t-hisys-xlliu + labels: + app: deepseekr10528-lb +spec: + replicas: 1 + selector: + matchLabels: + app: deepseekr10528-lb + template: + metadata: + labels: + app: deepseekr10528-lb + spec: + # nodeSelector: + # bo: "yes" + # tolerations: + # - key: bopd + # operator: Exists + # - key: node-role + # operator: Exists + containers: + - name: sgl-minilb + image: 
registry-cn-shanghai.siflow.cn/hisys/sglang:latest + command: + - python + - -m + - sglang.srt.disaggregation.mini_lb + - --prefill + - http://deepseekr10528-prefill-svc:30000 + - --decode + - http://deepseekr10528-decode-svc:30000 + - --host + - 0.0.0.0 + - --port + - "8000" + ports: + - containerPort: 8000 + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-lb-svc + namespace: t-hisys-xlliu +spec: + type: ClusterIP # in-cluster access only; for quick external testing use NodePort and uncomment nodePort below + selector: + app: deepseekr10528-lb + ports: + - protocol: TCP + port: 8000 # Service port (in-cluster) + targetPort: 8000 # containerPort of the sgl-minilb container + # nodePort: 30800 diff --git a/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml b/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml new file mode 100644 index 0000000..887676e --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/4p9d/p.yaml @@ -0,0 +1,251 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: deepseekr10528-prefill + namespace: t-hisys-xlliu +spec: + leaderWorkerTemplate: + leaderTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + labels: + role: leader + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 python3 -m sglang.launch_server --port 
"30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --disaggregation-mode prefill --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --tp-size 32 --dp-size 32 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 --trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode normal \ + --mem-fraction-static 0.85 --chunked-prefill-size 524288 \ + --max-running-requests 8192 --max-total-tokens 131072 \ + --context-length 8192 --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/prefill_${LWS_WORKER_INDEX}.log + # --context-length 8192 --init-expert-location YOUR_PATH \ + # --ep-num-redundant-experts 32 --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek --deepep-config YOUR_PATH + env: + - name: NVSHMEM_HCA_PE_MAPPING + # should modify according your rdma env + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NVSHMEM_IB_GID_INDEX + value: "5" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_DEBUG + value: INFO + # - name: NCCL_IB_TC + # value: "160" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "false" + # - name: NCCL_IB_SL + # value: "5" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: 
registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-leader + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu + restartPolicy: RecreateGroupOnPodRestart + size: 4 + workerTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + containers: + - command: + - /usr/bin/env + - bash + - -c + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + # LWS_LEADER_ADDRESS="${LWS_LEADER_ADDRESS%%.*}" + # echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + # sleep inf + MC_TE_METRIC=true SGLANG_TBO_DEBUG=1 python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /mnt/xstorage/model/deepseek-ai/DeepSeek-R1-0528 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --disaggregation-mode prefill --dist-init-addr $(LWS_LEADER_ADDRESS):20102 \ + --nnodes $(LWS_GROUP_SIZE) --node-rank $(LWS_WORKER_INDEX) \ + --tp-size 32 --dp-size 32 \ + --enable-dp-attention --decode-log-interval 1 --enable-deepep-moe \ + --page-size 1 --host 0.0.0.0 
--trust-remote-code --moe-dense-tp-size 1 \ + --enable-dp-lm-head --disable-radix-cache --watchdog-timeout 1000000 \ + --enable-two-batch-overlap --deepep-mode normal \ + --mem-fraction-static 0.85 --chunked-prefill-size 524288 \ + --max-running-requests 8192 --max-total-tokens 131072 \ + --context-length 8192 --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek 2>&1 | tee /mnt/xstorage/xlliu/sglang/DeepSeek-R1-0528/logs/prefill_${LWS_WORKER_INDEX}.log + # --context-length 8192 --init-expert-location YOUR_PATH \ + # --ep-num-redundant-experts 32 --ep-dispatch-algorithm dynamic --eplb-algorithm deepseek --deepep-config YOUR_PATH + env: + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: NVSHMEM_HCA_PE_MAPPING + # should modify according your rdma env + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_HCA + value: ^=mlx5_0,mlx5_5,mlx5_6 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "8" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: SGL_CHUNKED_PREFIX_CACHE_THRESHOLD + value: "0" + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + # - name: NCCL_IB_TC + # value: "136" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + # - name: NCCL_IB_SL + # value: "5" + - name: LWS_WORKER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.labels['leaderworkerset.sigs.k8s.io/worker-index'] + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + name: sglang-worker + ports: + - containerPort: 30000 + protocol: TCP + resources: + limits: + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: 
"1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + # securityContext: + # capabilities: + # add: + # - IPC_LOCK + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /mnt/xstorage/model + name: model + - mountPath: /mnt/xstorage/xlliu + name: xlliu + nodeSelector: + unit: "2" + volumes: + - emptyDir: + medium: Memory + name: dshm + - hostPath: + path: /mnt/xstorage/model + name: model + - hostPath: + path: /mnt/xstorage/xlliu + name: xlliu +--- +apiVersion: v1 +kind: Service +metadata: + name: deepseekr10528-prefill-svc + namespace: t-hisys-xlliu +spec: + selector: + leaderworkerset.sigs.k8s.io/name: deepseekr10528-prefill + role: leader + ports: + - protocol: TCP + port: 30000 + targetPort: 30000 diff --git a/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml b/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml new file mode 100644 index 0000000..9b97c37 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/one-engine/server.yaml @@ -0,0 +1,89 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sglang-server + namespace: t-hisys-xlliu +spec: + replicas: 1 + selector: + matchLabels: + app: sglang + template: + metadata: + labels: + app: sglang + annotations: + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: unit + operator: In + values: + - "2" + restartPolicy: Always + containers: + - name: sglang + image: registry-cn-shanghai.siflow.cn/hisys/sglang:latest + env: + - name: NCCL_IB_GID_INDEX + value: "5" + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + command: ["python3"] + args: + - "-m" + - "sglang.launch_server" + - "--model" + - "/root/.cache/huggingface/deepseek-ai/DeepSeek-V3-0324" + - "--tp" + - "8" + - "--trust-remote-code" 
+ - "--port" + - "30000" + ports: + - containerPort: 30000 + resources: + # 8 GPUs plus one device from each RDMA resource (rdma0-rdma7) + limits: + nvidia.com/gpu: 8 + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: huggingface-cache + mountPath: /root/.cache/huggingface + - name: shm + mountPath: /dev/shm + volumes: + - name: huggingface-cache + hostPath: + path: /mnt/xstorage/model + - name: shm + emptyDir: + medium: Memory + sizeLimit: 32Gi +--- +apiVersion: v1 +kind: Service +metadata: + name: sglang-service + namespace: t-hisys-xlliu +spec: + type: NodePort + selector: + app: sglang + ports: + - port: 30000 + targetPort: 30000 + nodePort: 30000 diff --git a/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml b/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml new file mode 100644 index 0000000..1ccddd6 --- /dev/null +++ b/launcher_scripts/k8s/inference/vllm/nixl/2p2d.yaml @@ -0,0 +1,381 @@ +# vllm-2p2d-all.yaml + +--- +# Prefill A +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-prefill-a + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-prefill-a + template: + metadata: + labels: + app: vllm-prefill-a + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: prefill-a + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + export VLLM_IS_PREFILL=1 + export
VLLM_NIXL_SIDE_CHANNEL_PORT=5557 + export NCCL_DEBUG=INFO + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8100 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Prefill B +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-prefill-b + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-prefill-b + template: + metadata: + labels: + app: vllm-prefill-b + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: prefill-b + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | 
+ export VLLM_IS_PREFILL=1 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5558 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8101 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Decode A +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-decode-a + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-decode-a + template: + metadata: + labels: + app: vllm-decode-a + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: decode-a + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: 
+ - | + export VLLM_IS_PREFILL=0 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5559 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8200 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Decode B +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-decode-b + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-decode-b + template: + metadata: + labels: + app: vllm-decode-b + scitix.ai/topo-aware-in-node: "true" + annotations: + sidecar.istio.io/inject: "false" + k8s.v1.cni.cncf.io/networks: kube-system/rdma0,kube-system/rdma1,kube-system/rdma2,kube-system/rdma3,kube-system/rdma4,kube-system/rdma5,kube-system/rdma6,kube-system/rdma7 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [gpu-node-017, gpu-node-018, gpu-node-022, gpu-node-023, gpu-node-024] + containers: + - name: decode-b + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + 
args: + - | + export VLLM_IS_PREFILL=0 + export VLLM_NIXL_SIDE_CHANNEL_PORT=5560 + CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ + vllm serve /data/model/deepseek-ai/DeepSeek-R1-0528/ \ + --port 8201 \ + --tensor-parallel-size 8 \ + --enforce-eager \ + --disable-log-requests \ + --block-size 128 \ + --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' & + sleep inf + resources: + limits: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + requests: + cpu: "80" + memory: 1000Gi + nvidia.com/gpu: "8" + nvidia.com/rdma0: "1" + nvidia.com/rdma1: "1" + nvidia.com/rdma2: "1" + nvidia.com/rdma3: "1" + nvidia.com/rdma4: "1" + nvidia.com/rdma5: "1" + nvidia.com/rdma6: "1" + nvidia.com/rdma7: "1" + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm + +--- +# Proxy +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-proxy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vllm-proxy + template: + metadata: + labels: + app: vllm-proxy + annotations: + sidecar.istio.io/inject: "false" + spec: + containers: + - name: proxy + image: registry-cn-shanghai.siflow.cn/hisys/pytorch:25.04-vllm + command: ["/usr/bin/env", "bash", "-c"] + args: + - | + python3 /data/vllm/toy_proxy_server.py \ + --port 8192 \ + --prefiller-port 8100 8101 \ + --decoder-port 8200 8201 & + sleep inf + volumeMounts: + - name: everything + mountPath: /data + - name: shm + mountPath: /dev/shm + volumes: + - name: everything + hostPath: + path: /mnt/xstorage + - name: shm + hostPath: + path: /dev/shm diff --git a/thirdparty/sglang b/thirdparty/sglang new file mode 160000 index 0000000..8604471 --- /dev/null +++ 
b/thirdparty/sglang @@ -0,0 +1 @@ +Subproject commit 86044712c6492df3ceb5a5cf025a575ab3989061 diff --git a/thirdparty/vllm b/thirdparty/vllm new file mode 160000 index 0000000..8020e98 --- /dev/null +++ b/thirdparty/vllm @@ -0,0 +1 @@ +Subproject commit 8020e98c9f033e76c97eb8261f772d59eba49c9a From 8542f4ab7a467a728295317515841b609ca0bb17 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:33:40 +0800 Subject: [PATCH 02/12] Create 1p2d_d.yaml --- .../k8s/inference/sglang/2p2d/1p2d_d.yaml | 197 ++++++++++++++++++ 1 file changed, 197 insertions(+) create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml new file mode 100644 index 0000000..334d3a4 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml @@ -0,0 +1,197 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: alan-deepseek-decode + namespace: llm +spec: + leaderWorkerTemplate: + size: 2 + leaderTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + labels: + role: decode-leader + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + deepseek-pool: "true" + containers: + - name: sglang-leader + image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + securityContext: + privileged: true + command: ["/usr/bin/env","bash","-c"] + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /data/DeepSeek-V3-0324 \ + --chunked-prefill-size "262144" --page-size "64" \ + --enable-dp-attention --enable-dp-lm-head --dp-size "2" \ + --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + 
--mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes 2 --node-rank 0 \ + --trust-remote-code \ + --disaggregation-transfer-backend nixl + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_GID_INDEX + value: "3" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: CUDA_LAUNCH_BLOCKING + value: "0" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + initialDelaySeconds: 300 + timeoutSeconds: 300 + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /data + name: model + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: model + hostPath: { path: /data } + + workerTemplate: + metadata: + annotations: + roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' + labels: + role: decode-worker + spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet + nodeSelector: + deepseek-pool: "true" + containers: + - name: sglang-worker + image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + securityContext: + privileged: true + command: 
["/usr/bin/env","bash","-c"] + args: + - |- + echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS" + echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" + echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" + exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ + --model-path /data/DeepSeek-V3-0324 \ + --chunked-prefill-size "262144" --page-size "64" \ + --enable-dp-attention --enable-dp-lm-head --dp-size "2" \ + --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ + --mem-fraction-static "0.849" --context-length "32768" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --cuda-graph-max-bs "64" --max-running-requests "2048" \ + --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \ + --trust-remote-code \ + --disaggregation-transfer-backend nixl + env: + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NVSHMEM_IB_TRAFFIC_CLASS + value: "16" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NCCL_IB_GID_INDEX + value: "3" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: "none" + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_MOONCAKE_TRANS_THREAD + value: "16" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + ports: + - containerPort: 30000 + protocol: TCP + readinessProbe: + initialDelaySeconds: 300 + timeoutSeconds: 300 + periodSeconds: 300 + tcpSocket: + port: 30000 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - mountPath: /dev/shm + name: dshm + - mountPath: /data + name: 
model + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: model + hostPath: { path: /data } + + replicas: 1 + rolloutStrategy: + type: RollingUpdate + rollingUpdateConfiguration: + maxSurge: 0 + maxUnavailable: 1 + startupPolicy: LeaderCreated From 6abd3e50c95fbca908cec066bf0beb1b173df644 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:34:13 +0800 Subject: [PATCH 03/12] Create 1p2d_p.yaml --- .../k8s/inference/sglang/2p2d/1p2d_p.yaml | 99 +++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml new file mode 100644 index 0000000..2a61d75 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml @@ -0,0 +1,99 @@ +apiVersion: leaderworkerset.x-k8s.io/v1 +kind: LeaderWorkerSet +metadata: + name: alan-deepseek-prefill + namespace: llm +spec: + leaderWorkerTemplate: + size: 1 + leaderTemplate: + metadata: + labels: + app: alan-deepseek + role: prefill-leader + leaderworkerset.sigs.k8s.io/role: leader + spec: + dnsPolicy: ClusterFirst + containers: + - name: sglang-leader + image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + command: ["/usr/bin/env","bash","-c"] + args: + - | + set -euxo pipefail + export POD_IP="$(hostname -i)" + echo "PREFILL leader on ${POD_IP}:30000" + exec python3 -m sglang.launch_server \ + --port 30000 \ + --host "0.0.0.0" \ + --model-path /data/DeepSeek-V3-0324 \ + --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --chunked-prefill-size 524288 \ + --max-prefill-tokens 32768 \ + --page-size 64 \ + --disable-radix-cache \ + --enable-deepep-moe --deepep-mode normal \ + --disaggregation-mode prefill \ + --mem-fraction-static 0.85 --context-length 32768 \ + --tp 8 \ + --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes 
${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \ + --trust-remote-code --max-running-requests 1024 \ + --disaggregation-transfer-backend nixl + env: + # 保留这些网络/通信相关 env;LWS_* 由控制器自动注入,无需自己用 fieldRef 再注入 + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: NCCL_SOCKET_IFNAME + value: eth0 + - name: NCCL_IB_GID_INDEX + value: "3" + - name: NCCL_IB_QPS_PER_CONNECTION + value: "8" + - name: NCCL_IB_SPLIT_DATA_ON_QPS + value: "1" + - name: NCCL_NET_PLUGIN + value: none + - name: NCCL_MIN_NCHANNELS + value: "4" + - name: SGLANG_SET_CPU_AFFINITY + value: "true" + - name: SGL_ENABLE_JIT_DEEPGEMM + value: "1" + - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + value: "1" + - name: NVSHMEM_IB_GID_INDEX + value: "3" + - name: NVSHMEM_HCA_PE_MAPPING + value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + ports: + - containerPort: 30000 + - containerPort: 8998 # NIXL bootstrap + readinessProbe: + tcpSocket: { port: 30000 } + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + resources: + limits: + nvidia.com/gpu: "8" + volumeMounts: + - { name: dshm, mountPath: /dev/shm } + - { name: model, mountPath: /data } + volumes: + - name: dshm + emptyDir: { medium: Memory } + - name: model + hostPath: { path: /data } + # 1P 不会创建 worker;如不需要可整段删除 + workerTemplate: + metadata: + labels: + app: alan-deepseek + role: prefill-worker + leaderworkerset.sigs.k8s.io/role: worker + spec: + containers: + - name: noop + image: busybox + command: ["sh","-c","sleep 3600000"] From 95bbbf8752d585cfa014a0f32233a46267606bd5 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:34:52 +0800 Subject: [PATCH 04/12] Create 1p2d_lb.yaml --- .../k8s/inference/sglang/2p2d/1p2d_lb.yaml | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml new file 
mode 100644 index 0000000..f9e77bc --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alan-deepseek-lb + namespace: llm + labels: {app: alan-deepseek} +spec: + replicas: 1 + selector: + matchLabels: {app: alan-deepseek, tier: lb} + template: + metadata: + labels: {app: alan-deepseek, tier: lb} + spec: + containers: + - name: sgl-minilb + image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + command: ["python","-m","sglang.srt.disaggregation.mini_lb", + "--prefill","http://alan-deepseek-prefill-svc:30000", + "--decode","http://alan-deepseek-decode-svc:30000", + "--host","0.0.0.0","--port","8000"] + ports: + - containerPort: 8000 + readinessProbe: + tcpSocket: {port: 8000} + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + volumeMounts: + - name: model + mountPath: /data + volumes: + - name: model + hostPath: {path: /data} + +--- +apiVersion: v1 +kind: Service +metadata: + name: alan-deepseek-svc + namespace: llm +spec: + type: ClusterIP + selector: + app: alan-deepseek + tier: lb + ports: + - name: http + protocol: TCP + port: 8000 + targetPort: 8000 From 1a4fb726ad394dab555f9c95cc9bcd62642cbd29 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Mon, 8 Sep 2025 10:35:26 +0800 Subject: [PATCH 05/12] Create svc.yaml --- .../k8s/inference/sglang/2p2d/svc.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml new file mode 100644 index 0000000..c7a4404 --- /dev/null +++ b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml @@ -0,0 +1,37 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: alan-deepseek-prefill-svc + namespace: llm +spec: + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: 
alan-deepseek-prefill + leaderworkerset.sigs.k8s.io/role: leader + ports: + - name: http + protocol: TCP + port: 30000 + targetPort: 30000 + - name: nixl-boot + protocol: TCP + port: 8998 + targetPort: 8998 + +--- +apiVersion: v1 +kind: Service +metadata: + name: alan-deepseek-decode-svc + namespace: llm +spec: + type: ClusterIP + selector: + leaderworkerset.sigs.k8s.io/name: alan-deepseek-decode + leaderworkerset.sigs.k8s.io/role: leader + ports: + - name: http + protocol: TCP + port: 30000 + targetPort: 30000 From c4fe1d36ab043df2cc64d75f614fc962b4103f2c Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:17:41 +0800 Subject: [PATCH 06/12] Update 1p2d_p.yaml --- .../k8s/inference/sglang/2p2d/1p2d_p.yaml | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml index 2a61d75..cdfc793 100644 --- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml @@ -2,7 +2,7 @@ apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: name: alan-deepseek-prefill - namespace: llm + namespace: t-ai-infra-qqxu03 spec: leaderWorkerTemplate: size: 1 @@ -13,23 +13,23 @@ spec: role: prefill-leader leaderworkerset.sigs.k8s.io/role: leader spec: + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference dnsPolicy: ClusterFirst containers: - name: sglang-leader - image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + image: lmsysorg/sglang:v0.5.1.post2-cu126 command: ["/usr/bin/env","bash","-c"] args: - | set -euxo pipefail export POD_IP="$(hostname -i)" - echo "PREFILL leader on ${POD_IP}:30000" exec python3 -m sglang.launch_server \ --port 30000 \ --host "0.0.0.0" \ - --model-path /data/DeepSeek-V3-0324 \ - --disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3 \ + --model-path /data/DeepSeek-R1 \ + 
--disaggregation-ib-device mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7 \ --chunked-prefill-size 524288 \ - --max-prefill-tokens 32768 \ --page-size 64 \ --disable-radix-cache \ --enable-deepep-moe --deepep-mode normal \ @@ -41,7 +41,7 @@ spec: --trust-remote-code --max-running-requests 1024 \ --disaggregation-transfer-backend nixl env: - # 保留这些网络/通信相关 env;LWS_* 由控制器自动注入,无需自己用 fieldRef 再注入 + # 这些参数可以花时间看看哪些是可以去掉的,一直没太搞明白cc瑞哥 - name: GLOO_SOCKET_IFNAME value: eth0 - name: NCCL_SOCKET_IFNAME @@ -50,8 +50,6 @@ spec: value: "3" - name: NCCL_IB_QPS_PER_CONNECTION value: "8" - - name: NCCL_IB_SPLIT_DATA_ON_QPS - value: "1" - name: NCCL_NET_PLUGIN value: none - name: NCCL_MIN_NCHANNELS @@ -60,12 +58,6 @@ spec: value: "true" - name: SGL_ENABLE_JIT_DEEPGEMM value: "1" - - name: NVSHMEM_ENABLE_NIC_PE_MAPPING - value: "1" - - name: NVSHMEM_IB_GID_INDEX - value: "3" - - name: NVSHMEM_HCA_PE_MAPPING - value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" ports: - containerPort: 30000 - containerPort: 8998 # NIXL bootstrap @@ -84,7 +76,8 @@ spec: - name: dshm emptyDir: { medium: Memory } - name: model - hostPath: { path: /data } + persistentVolumeClaim: + claimName: siflow-models # 1P 不会创建 worker;如不需要可整段删除 workerTemplate: metadata: @@ -93,6 +86,8 @@ spec: role: prefill-worker leaderworkerset.sigs.k8s.io/role: worker spec: + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference containers: - name: noop image: busybox From 54c89eafad6fa89b8dee3d975c30c2effc9fafd8 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:18:08 +0800 Subject: [PATCH 07/12] Update 1p2d_p.yaml --- launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml index cdfc793..bcbc1fb 100644 --- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml +++ 
b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml @@ -78,7 +78,7 @@ spec: - name: model persistentVolumeClaim: claimName: siflow-models - # 1P 不会创建 worker;如不需要可整段删除 + # 1P 不会创建 worker;如不需要可整段删除,如果多个p需要,因此保留 workerTemplate: metadata: labels: From f9fb6ef4f0ce8afde830f4512d457c560bfca8df Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:18:42 +0800 Subject: [PATCH 08/12] Update 1p2d_d.yaml --- .../k8s/inference/sglang/2p2d/1p2d_d.yaml | 126 ++++++++---------- 1 file changed, 58 insertions(+), 68 deletions(-) diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml index 334d3a4..858a3a4 100644 --- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_d.yaml @@ -2,25 +2,24 @@ apiVersion: leaderworkerset.x-k8s.io/v1 kind: LeaderWorkerSet metadata: name: alan-deepseek-decode - namespace: llm + namespace: t-ai-infra-qqxu03 spec: leaderWorkerTemplate: size: 2 leaderTemplate: metadata: - annotations: - roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' labels: + leaderworkerset.sigs.k8s.io/role: leader role: decode-leader spec: hostNetwork: true hostIPC: true dnsPolicy: ClusterFirstWithHostNet nodeSelector: - deepseek-pool: "true" + "siflow.scitix.ai/resource-pool-name": hisys-inference containers: - name: sglang-leader - image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + image: lmsysorg/sglang:v0.5.1.post2-cu126 securityContext: privileged: true command: ["/usr/bin/env","bash","-c"] @@ -30,44 +29,40 @@ spec: echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ - --model-path /data/DeepSeek-V3-0324 \ + --model-path /data/DeepSeek-R1 \ --chunked-prefill-size "262144" --page-size "64" \ - --enable-dp-attention --enable-dp-lm-head --dp-size "2" \ + --enable-dp-attention 
--enable-dp-lm-head --dp-size "16" \ --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ --mem-fraction-static "0.849" --context-length "32768" \ - --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ --cuda-graph-max-bs "64" --max-running-requests "2048" \ - --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ - --nnodes 2 --node-rank 0 \ - --trust-remote-code \ - --disaggregation-transfer-backend nixl + --eplb-rebalance-layers-per-chunk "29" \ + --tp-size "16" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --nnodes 2 --node-rank 0 --moe-dense-tp-size "1" \ + --trust-remote-code --disaggregation-transfer-backend nixl env: - name: SGLANG_HOST_IP valueFrom: {fieldRef: {fieldPath: status.hostIP}} - name: HOST_IP valueFrom: {fieldRef: {fieldPath: status.hostIP}} - - name: GLOO_SOCKET_IFNAME - value: eth0 - - name: NCCL_SOCKET_IFNAME - value: eth0 - - name: NCCL_IB_GID_INDEX - value: "3" - - name: NVSHMEM_IB_GID_INDEX - value: "3" - - name: NVSHMEM_ENABLE_NIC_PE_MAPPING - value: "1" - - name: NVSHMEM_HCA_PE_MAPPING - value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" - - name: NCCL_IB_QPS_PER_CONNECTION - value: "8" - - name: NCCL_IB_SPLIT_DATA_ON_QPS - value: "1" - - name: CUDA_LAUNCH_BLOCKING - value: "0" - - name: NCCL_NET_PLUGIN - value: "none" - - name: NCCL_MIN_NCHANNELS - value: "4" + # - name: NCCL_IB_GID_INDEX + # value: "3" + # - name: NVSHMEM_IB_GID_INDEX + # value: "3" + # - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + # value: "1" + # - name: NVSHMEM_HCA_PE_MAPPING + # value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + # - name: NCCL_IB_QPS_PER_CONNECTION + # value: "8" + # - name: NCCL_IB_SPLIT_DATA_ON_QPS + # value: "1" + # - name: CUDA_LAUNCH_BLOCKING + # value: "0" + # - name: NCCL_NET_PLUGIN + # value: "none" + # - name: NCCL_MIN_NCHANNELS + # value: "4" - name: MC_TE_METRIC value: "true" 
- name: SGLANG_MOONCAKE_TRANS_THREAD @@ -95,12 +90,11 @@ spec: - name: dshm emptyDir: { medium: Memory } - name: model - hostPath: { path: /data } + persistentVolumeClaim: + claimName: siflow-models workerTemplate: metadata: - annotations: - roce/gid-injection: '{"envName": "NCCL_IB_GID_INDEX"}' labels: role: decode-worker spec: @@ -108,10 +102,10 @@ spec: hostIPC: true dnsPolicy: ClusterFirstWithHostNet nodeSelector: - deepseek-pool: "true" + "siflow.scitix.ai/resource-pool-name": hisys-inference containers: - name: sglang-worker - image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + image: lmsysorg/sglang:v0.5.1.post2-cu126 securityContext: privileged: true command: ["/usr/bin/env","bash","-c"] @@ -121,44 +115,39 @@ spec: echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE" echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX" exec python3 -m sglang.launch_server --port "30000" --host "0.0.0.0" \ - --model-path /data/DeepSeek-V3-0324 \ + --model-path /data/DeepSeek-R1 \ --chunked-prefill-size "262144" --page-size "64" \ - --enable-dp-attention --enable-dp-lm-head --dp-size "2" \ + --enable-dp-attention --enable-dp-lm-head --dp-size "16" \ --enable-deepep-moe --deepep-mode low_latency --disaggregation-mode decode \ --mem-fraction-static "0.849" --context-length "32768" \ - --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3" \ + --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \ --cuda-graph-max-bs "64" --max-running-requests "2048" \ - --tp-size "8" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ + --tp-size "16" --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \ --nnodes ${LWS_GROUP_SIZE} --node-rank ${LWS_WORKER_INDEX} \ - --trust-remote-code \ - --disaggregation-transfer-backend nixl + --trust-remote-code --moe-dense-tp-size "1" --disaggregation-transfer-backend nixl env: - name: SGLANG_HOST_IP valueFrom: {fieldRef: {fieldPath: status.hostIP}} - name: HOST_IP valueFrom: {fieldRef: {fieldPath: status.hostIP}} - - name: 
GLOO_SOCKET_IFNAME - value: eth0 - - name: NCCL_SOCKET_IFNAME - value: eth0 - - name: NVSHMEM_IB_TRAFFIC_CLASS - value: "16" - - name: NVSHMEM_IB_GID_INDEX - value: "3" - - name: NCCL_IB_GID_INDEX - value: "3" - - name: NVSHMEM_ENABLE_NIC_PE_MAPPING - value: "1" - - name: NVSHMEM_HCA_PE_MAPPING - value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" - - name: NCCL_IB_QPS_PER_CONNECTION - value: "8" - - name: NCCL_IB_SPLIT_DATA_ON_QPS - value: "1" - - name: NCCL_NET_PLUGIN - value: "none" - - name: NCCL_MIN_NCHANNELS - value: "4" + # - name: NVSHMEM_IB_TRAFFIC_CLASS + # value: "16" + # - name: NVSHMEM_IB_GID_INDEX + # value: "3" + # - name: NCCL_IB_GID_INDEX + # value: "3" + # - name: NVSHMEM_ENABLE_NIC_PE_MAPPING + # value: "1" + # - name: NVSHMEM_HCA_PE_MAPPING + # value: "mlx5_0:1:2,mlx5_1:1:2,mlx5_2:1:2,mlx5_3:1:2" + # - name: NCCL_IB_QPS_PER_CONNECTION + # value: "8" + # - name: NCCL_IB_SPLIT_DATA_ON_QPS + # value: "1" + # - name: NCCL_NET_PLUGIN + # value: "none" + # - name: NCCL_MIN_NCHANNELS + # value: "4" - name: MC_TE_METRIC value: "true" - name: SGLANG_MOONCAKE_TRANS_THREAD @@ -186,7 +175,8 @@ spec: - name: dshm emptyDir: { medium: Memory } - name: model - hostPath: { path: /data } + persistentVolumeClaim: + claimName: siflow-models replicas: 1 rolloutStrategy: From 2a107e6c6b8116d92180f6813c1cfa7531e10184 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:19:14 +0800 Subject: [PATCH 09/12] Update 1p2d_lb.yaml --- .../k8s/inference/sglang/2p2d/1p2d_lb.yaml | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml index f9e77bc..9614da9 100644 --- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_lb.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: alan-deepseek-lb - namespace:
llm + namespace: t-ai-infra-qqxu03 labels: {app: alan-deepseek} spec: replicas: 1 @@ -12,11 +12,14 @@ spec: metadata: labels: {app: alan-deepseek, tier: lb} spec: + nodeSelector: + "siflow.scitix.ai/resource-pool-name": hisys-inference containers: - name: sgl-minilb - image: sealos.hub:5000/sglang:v0.5.1.post1-cu126 + image: lmsysorg/sglang:v0.5.1.post2-cu126 command: ["python","-m","sglang.srt.disaggregation.mini_lb", "--prefill","http://alan-deepseek-prefill-svc:30000", + "--prefill-bootstrap-ports","8998", "--decode","http://alan-deepseek-decode-svc:30000", "--host","0.0.0.0","--port","8000"] ports: @@ -31,14 +34,15 @@ spec: mountPath: /data volumes: - name: model - hostPath: {path: /data} + persistentVolumeClaim: + claimName: siflow-models --- apiVersion: v1 kind: Service metadata: name: alan-deepseek-svc - namespace: llm + namespace: t-ai-infra-qqxu03 spec: type: ClusterIP selector: From 210bd008426310fa7829372cb3c21487d736f231 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Wed, 10 Sep 2025 11:19:45 +0800 Subject: [PATCH 10/12] Update svc.yaml --- launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml index c7a4404..0750ccd 100644 --- a/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml +++ b/launcher_scripts/k8s/inference/sglang/2p2d/svc.yaml @@ -3,7 +3,7 @@ apiVersion: v1 kind: Service metadata: name: alan-deepseek-prefill-svc - namespace: llm + namespace: t-ai-infra-qqxu03 spec: type: ClusterIP selector: @@ -24,7 +24,7 @@ apiVersion: v1 kind: Service metadata: name: alan-deepseek-decode-svc - namespace: llm + namespace: t-ai-infra-qqxu03 spec: type: ClusterIP selector: From 30c51f0874dbb78dc8329d516a4538149df8de96 Mon Sep 17 00:00:00 2001 From: ziang663 <119752791+ziang663@users.noreply.github.com> Date: Wed, 10 Sep 2025 14:40:56 +0800 
Subject: [PATCH 11/12] Update 1p2d_p.yaml --- .../k8s/inference/sglang/2p2d/1p2d_p.yaml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml index bcbc1fb..18b2831 100644 --- a/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml +++ b/launcher_scripts/k8s/inference/sglang/2p2d/1p2d_p.yaml @@ -13,9 +13,11 @@ spec: role: prefill-leader leaderworkerset.sigs.k8s.io/role: leader spec: + hostNetwork: true + hostIPC: true + dnsPolicy: ClusterFirstWithHostNet nodeSelector: "siflow.scitix.ai/resource-pool-name": hisys-inference - dnsPolicy: ClusterFirst containers: - name: sglang-leader image: lmsysorg/sglang:v0.5.1.post2-cu126 @@ -41,11 +43,14 @@ spec: --trust-remote-code --max-running-requests 1024 \ --disaggregation-transfer-backend nixl env: - # 这些参数可以花时间看看哪些是可以去掉的,一直没太搞明白cc瑞哥 + - name: SGLANG_HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} + - name: HOST_IP + valueFrom: {fieldRef: {fieldPath: status.hostIP}} - name: GLOO_SOCKET_IFNAME - value: eth0 + value: bond0 - name: NCCL_SOCKET_IFNAME - value: eth0 + value: bond0 - name: NCCL_IB_GID_INDEX value: "3" - name: NCCL_IB_QPS_PER_CONNECTION @@ -78,7 +83,6 @@ spec: - name: model persistentVolumeClaim: claimName: siflow-models - # 1P 不会创建 worker;如不需要可整段删除,如果多个p需要,因此保留 workerTemplate: metadata: labels: @@ -92,3 +96,4 @@ spec: - name: noop image: busybox command: ["sh","-c","sleep 3600000"] + From f1c189b5c5628cb5048f03a8ba7eb0a680346185 Mon Sep 17 00:00:00 2001 From: jrshe Date: Mon, 15 Sep 2025 07:43:04 +0000 Subject: [PATCH 12/12] Add 1p1d yamls --- .../k8s/inference/sglang/1p1d/d.yaml | 176 ++++++++++++++++++ .../k8s/inference/sglang/1p1d/lb.yaml | 61 ++++++ .../k8s/inference/sglang/1p1d/p.yaml | 85 +++++++++ .../k8s/inference/sglang/1p1d/svc.yaml | 35 ++++ 4 files changed, 357 insertions(+) create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/d.yaml create 
mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
 create mode 100644 launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml

diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
new file mode 100644
index 0000000..a2d23b0
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/d.yaml
@@ -0,0 +1,184 @@
+# Decode side of the SGLang 1P1D (one prefill, one decode) deployment.
+# A single LeaderWorkerSet group of `size` pods; each pod requests 8 GPUs and
+# the group runs one decode engine launched with --tp-size 16.
+apiVersion: leaderworkerset.x-k8s.io/v1
+kind: LeaderWorkerSet
+metadata:
+  name: deepseek-decode
+  namespace: default
+spec:
+  leaderWorkerTemplate:
+    size: 2
+    leaderTemplate:
+      metadata:
+        labels:
+          # Matched by the deepseek-decode-svc selector in svc.yaml.
+          leaderworkerset.sigs.k8s.io/role: leader
+          role: decode-leader
+      spec:
+        hostNetwork: true
+        hostIPC: true
+        # Keeps cluster DNS resolution working while hostNetwork is enabled.
+        dnsPolicy: ClusterFirstWithHostNet
+        nodeSelector:
+          "scitix.ai/gpu-type": h20xnvlink141
+          "roce.scitix.ai/unit": unit1
+        containers:
+          - name: sglang-leader
+            image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+            securityContext:
+              privileged: true
+            command: ["/usr/bin/env","bash","-c"]
+            # --nnodes/--node-rank use the LWS_* variables echoed below, as the worker does.
+            # NOTE(review): the leader passes --eplb-rebalance-layers-per-chunk but the
+            # worker does not; confirm the two ranks are meant to differ.
+            args:
+              - |-
+                echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+                echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+                echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+                exec python3 -m sglang.launch_server \
+                  --host "0.0.0.0" \
+                  --port 30000 \
+                  --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+                  --chunked-prefill-size 20480 \
+                  --page-size 64 \
+                  --enable-deepep-moe \
+                  --deepep-mode low_latency \
+                  --disaggregation-mode decode \
+                  --mem-fraction-static 0.85 \
+                  --context-length 32768 \
+                  --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+                  --cuda-graph-max-bs 64 \
+                  --max-running-requests 2048 \
+                  --eplb-rebalance-layers-per-chunk 29 \
+                  --tp-size 16 \
+                  --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+                  --nnodes ${LWS_GROUP_SIZE} \
+                  --node-rank ${LWS_WORKER_INDEX} \
+                  --moe-dense-tp-size 1 \
+                  --trust-remote-code \
+                  --disaggregation-transfer-backend nixl \
+                  --enable-dp-attention \
+                  --enable-dp-lm-head \
+                  --dp-size 8
+            env:
+              - name: SGLANG_HOST_IP
+                valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+              - name: HOST_IP
+                valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+              - name: MC_TE_METRIC
+                value: "true"
+              - name: SGL_ENABLE_JIT_DEEPGEMM
+                value: "1"
+            ports:
+              - containerPort: 30000
+                protocol: TCP
+            readinessProbe:
+              initialDelaySeconds: 100
+              timeoutSeconds: 300
+              periodSeconds: 30
+              tcpSocket:
+                port: 30000
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /mnt
+                name: mnt
+        volumes:
+          - name: dshm
+            emptyDir: { medium: Memory }
+          - name: mnt
+            hostPath:
+              path: /mnt/xstorage
+
+    workerTemplate:
+      metadata:
+        labels:
+          role: decode-worker
+      spec:
+        hostNetwork: true
+        hostIPC: true
+        dnsPolicy: ClusterFirstWithHostNet
+        nodeSelector:
+          "scitix.ai/gpu-type": h20xnvlink141
+          "roce.scitix.ai/unit": unit1
+        containers:
+          - name: sglang-worker
+            image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+            securityContext:
+              privileged: true
+            command: ["/usr/bin/env","bash","-c"]
+            args:
+              - |-
+                echo "LWS_LEADER_ADDRESS=$LWS_LEADER_ADDRESS"
+                echo "LWS_GROUP_SIZE=$LWS_GROUP_SIZE"
+                echo "LWS_WORKER_INDEX=$LWS_WORKER_INDEX"
+                exec python3 -m sglang.launch_server \
+                  --host "0.0.0.0" \
+                  --port 30000 \
+                  --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+                  --chunked-prefill-size 20480 \
+                  --page-size 64 \
+                  --enable-deepep-moe \
+                  --deepep-mode low_latency \
+                  --disaggregation-mode decode \
+                  --mem-fraction-static 0.85 \
+                  --context-length 32768 \
+                  --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+                  --cuda-graph-max-bs 64 \
+                  --max-running-requests 2048 \
+                  --tp-size 16 \
+                  --dist-init-addr ${LWS_LEADER_ADDRESS}.svc.cluster.local:20102 \
+                  --nnodes ${LWS_GROUP_SIZE} \
+                  --node-rank ${LWS_WORKER_INDEX} \
+                  --trust-remote-code \
+                  --moe-dense-tp-size 1 \
+                  --disaggregation-transfer-backend nixl \
+                  --enable-dp-attention \
+                  --enable-dp-lm-head \
+                  --dp-size 8
+
+            env:
+              - name: SGLANG_HOST_IP
+                valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+              - name: HOST_IP
+                valueFrom: {fieldRef: {fieldPath: status.hostIP}}
+              - name: MC_TE_METRIC
+                value: "true"
+              - name: SGL_ENABLE_JIT_DEEPGEMM
+                value: "1"
+            ports:
+              - containerPort: 30000
+                protocol: TCP
+            readinessProbe:
+              initialDelaySeconds: 100
+              timeoutSeconds: 300
+              periodSeconds: 30
+              tcpSocket:
+                port: 30000
+            resources:
+              limits:
+                nvidia.com/gpu: "8"
+            volumeMounts:
+              - mountPath: /dev/shm
+                name: dshm
+              - mountPath: /mnt
+                name: mnt
+        volumes:
+          - name: dshm
+            emptyDir: { medium: Memory }
+          - name: mnt
+            hostPath:
+              path: /mnt/xstorage
+
+  replicas: 1
+  rolloutStrategy:
+    type: RollingUpdate
+    rollingUpdateConfiguration:
+      maxSurge: 0
+      maxUnavailable: 1
+  startupPolicy: LeaderCreated
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
new file mode 100644
index 0000000..5cc16ce
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/lb.yaml
@@ -0,0 +1,65 @@
+# Load balancer for the 1P1D deployment: runs sglang's mini_lb on port 8000,
+# pointed at the prefill and decode Services, plus a ClusterIP Service for it.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deepseek-lb
+  namespace: default
+  labels: {app: deepseek, tier: lb}
+spec:
+  replicas: 1
+  selector:
+    matchLabels: {app: deepseek, tier: lb}
+  template:
+    metadata:
+      labels: {app: deepseek, tier: lb}
+    spec:
+      hostNetwork: true
+      hostIPC: true
+      dnsPolicy: ClusterFirstWithHostNet
+      nodeSelector:
+        "roce.scitix.ai/unit": unit1
+      containers:
+        - name: sglang-minilb
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          command: ["/usr/bin/env", "bash", "-c"]
+          # NOTE(review): the 1p2d load balancer in this series also passes
+          # --prefill-bootstrap-ports 8998; confirm the default is sufficient here.
+          args:
+            - |
+              exec python -m sglang.srt.disaggregation.mini_lb \
+                --prefill "http://deepseek-prefill-svc:30000" \
+                --decode "http://deepseek-decode-svc:30000" \
+                --host "0.0.0.0" \
+                --port 8000
+          ports:
+            - containerPort: 8000
+              name: http
+          readinessProbe:
+            tcpSocket: {port: 8000}
+            initialDelaySeconds: 5
+            periodSeconds: 5
+            timeoutSeconds: 3
+          volumeMounts:
+            - name: mnt
+              mountPath: /mnt
+      volumes:
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector:
+    app: deepseek
+    tier: lb
+  ports:
+    - name: http
+      protocol: TCP
+      port: 8000
+      targetPort: 8000
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
new file mode 100644
index 0000000..4d47e37
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/p.yaml
@@ -0,0 +1,89 @@
+# Prefill side of the 1P1D deployment: a one-replica Deployment whose pod
+# requests 8 GPUs and launches SGLang in prefill mode with --tp-size 8.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: deepseek-prefill
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: deepseek-prefill
+  template:
+    metadata:
+      labels:
+        app: deepseek-prefill
+    spec:
+      hostNetwork: true
+      hostIPC: true
+      dnsPolicy: ClusterFirstWithHostNet
+      nodeSelector:
+        "scitix.ai/gpu-type": h20xnvlink141
+        "roce.scitix.ai/unit": unit1
+      containers:
+        - name: sglang-prefill
+          image: registry-cn-shanghai.siflow.cn/hisys/sglang:v0.5.1.post2-cu126
+          securityContext:
+            privileged: true
+          command: ["/usr/bin/env", "bash", "-c"]
+          # NOTE(review): POD_IP is exported but nothing else in this manifest reads it.
+          args:
+            - |
+              set -euxo pipefail
+              export POD_IP="$(hostname -i)"
+              exec python3 -m sglang.launch_server \
+                --host "0.0.0.0" \
+                --port 30000 \
+                --model-path /mnt/models/deepseek-ai/DeepSeek-V3.1-Base \
+                --disaggregation-ib-device "mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7" \
+                --chunked-prefill-size 20480 \
+                --page-size 64 \
+                --disable-radix-cache \
+                --enable-deepep-moe \
+                --deepep-mode normal \
+                --disaggregation-mode prefill \
+                --mem-fraction-static 0.85 \
+                --context-length 32768 \
+                --tp-size 8 \
+                --trust-remote-code \
+                --max-running-requests 1024 \
+                --disaggregation-transfer-backend nixl
+          env:
+            - name: GLOO_SOCKET_IFNAME
+              value: bond0
+            - name: NCCL_SOCKET_IFNAME
+              value: bond0
+            - name: NCCL_IB_GID_INDEX
+              value: "3"
+            - name: NCCL_IB_QPS_PER_CONNECTION
+              value: "8"
+            - name: NCCL_NET_PLUGIN
+              value: none
+            - name: NCCL_MIN_NCHANNELS
+              value: "4"
+            - name: SGLANG_SET_CPU_AFFINITY
+              value: "true"
+            - name: SGL_ENABLE_JIT_DEEPGEMM
+              value: "1"
+          ports:
+            # 8998 is published as `nixl-boot` by deepseek-prefill-svc (svc.yaml).
+            - containerPort: 30000
+            - containerPort: 8998
+          readinessProbe:
+            tcpSocket: { port: 30000 }
+            initialDelaySeconds: 10
+            periodSeconds: 5
+            timeoutSeconds: 3
+          resources:
+            limits:
+              nvidia.com/gpu: "8"
+          volumeMounts:
+            - { name: dshm, mountPath: /dev/shm }
+            - { name: mnt, mountPath: /mnt }
+      volumes:
+        - name: dshm
+          emptyDir: { medium: Memory }
+        - name: mnt
+          hostPath:
+            path: /mnt/xstorage
diff --git a/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml
new file mode 100644
index 0000000..e1db8e3
--- /dev/null
+++ b/launcher_scripts/k8s/inference/sglang/1p1d/svc.yaml
@@ -0,0 +1,38 @@
+# Services for the 1P1D deployment: one for the prefill Deployment and one
+# for the decode LeaderWorkerSet leader.
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-prefill-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector:
+    app: deepseek-prefill
+  ports:
+    - name: http
+      protocol: TCP
+      port: 30000
+      targetPort: 30000
+    - name: nixl-boot
+      protocol: TCP
+      port: 8998
+      targetPort: 8998
+
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: deepseek-decode-svc
+  namespace: default
+spec:
+  type: ClusterIP
+  selector:
+    # Restricts the Service to the leader pod via the role label set in d.yaml.
+    leaderworkerset.sigs.k8s.io/name: deepseek-decode
+    leaderworkerset.sigs.k8s.io/role: leader
+  ports:
+    - name: http
+      protocol: TCP
+      port: 30000
+      targetPort: 30000