From 4d56166a8364aa5bf1eef70d2aa1419971110a60 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Thu, 25 Dec 2025 00:26:14 +0000 Subject: [PATCH 1/5] initial commit for a4x dynamo deepseek-fp8 2p2d recipe --- .../disaggregated-serving/dynamo/README.md | 28 ++ .../disaggregated-serving/dynamo/values.yaml | 197 ++++++++ .../deepseekr1-fp8-multi-node-decode.yaml | 46 ++ .../deepseekr1-fp8-multi-node-prefill.yaml | 46 ++ .../dynamo-deployment/Chart.yaml | 20 + .../templates/dynamo-compute-domain.yaml | 24 + .../templates/dynamo-graph-deployment.yaml | 470 ++++++++++++++++++ .../templates/dynamo-launcher-configmap.yaml | 28 ++ .../templates/dynamo-worker-configmap.yaml | 35 ++ 9 files changed, 894 insertions(+) create mode 100644 inference/a4x/disaggregated-serving/dynamo/README.md create mode 100644 inference/a4x/disaggregated-serving/dynamo/values.yaml create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml create mode 100644 src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md new file mode 100644 index 00000000..ec499d10 --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -0,0 +1,28 @@ +# Disaggregated Multi-Node Dynamo Recipe for A4x + +This recipe runs a disaggregated multi-node Dynamo deployment on A4x. + +## Setup + +1. **Set Environment Variables** + + ```bash + export REPO_ROOT=$(git rev-parse --show-toplevel) + export RELEASE_VERSION="24.05" + export USER=$(whoami) + ``` + +2. **Run the Recipe** + + ```bash + helm install -f values.yaml \ + --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-vllm-launcher.sh \ + --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/llama-3.3-70b-multi-node.yaml \ + --set workload.framework=vllm \ + --set workload.model.name=meta-llama/Llama-3.3-70B-Instruct \ + --set workload.image=nvcr.io/nvidia/ai-dynamo/vllm-runtime:${RELEASE_VERSION} \ + --set workload.gpus=16 \ + $USER-dynamo-multi-node-serving-a4x \ + $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment + ``` + diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml new file mode 100644 index 00000000..b49162bc --- /dev/null +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -0,0 +1,197 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +dynamo: + namespace: dynamo-cloud + releaseVersion: "0.7.0" + deploymentName: + computeDomain: + name: yijiaj-a4x-domain + numNodes: 4 + resourceClaimTemplateName: yijiaj-a4x-channel + frontend: + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1 + replicas: 1 + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + decodeWorker: + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 + nodeCount: 2 + replicas: 1 + envs: + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_DECODE_BOOTSTRAP_TIMEOUT + value: "1000" + - name: SGLANG_HACK_SEQ_BOOTSTRAP_ROOM + value: "1" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: PYTHONUNBUFFERED + value: "1" + - name: NCCL_DEBUG + value: INFO + - name: NCCL_DEBUG_SUBSYS + value: INIT,BOOTSTRAP,ENV,NET,GRAPH + - name: NCCL_SOCKET_FAMILY + value: "AF_INET" + - name: GLOO_SOCKET_FAMILY + value: "AF_INET" + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + startupProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 600 + failureThreshold: 3000 + prefillWorker: + image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1 + nodeCount: 2 + replicas: 1 + envs: + - name: LD_LIBRARY_PATH + value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64" + - name: UCX_TLS + value: "^tcp" + - name: GLOO_SOCKET_IFNAME + value: eth0 + - name: TP_SOCKET_IFNAME + value: eth0 + - name: SGLANG_ENABLE_JIT_DEEPGEMM + value: "1" + - name: DYN_SKIP_SGLANG_LOG_FORMATTING + value: "1" + - name: MC_TE_METRIC + value: "true" + - name: SGLANG_ENABLE_FLASHINFER_GEMM + value: "1" + - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE + value: "100000" + - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT + value: "100000" + - name: SGLANG_DISAGGREGATION_WAITING_TIMEOUT + value: "100000" + - name: SGLANG_MOONCAKE_CUSTOM_MEM_POOL + value: "True" + - name: MC_FORCE_MNNVL + value: "1" + - name: NCCL_MNNVL_ENABLE + value: "1" + - name: NCCL_CUMEM_ENABLE + value: "1" + - name: SGLANG_USE_MESSAGE_QUEUE_BROADCASTER + value: "0" + - name: 
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK + value: "1" + - name: PYTHONUNBUFFERED + value: "1" + livenessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 150 + failureThreshold: 100 + readinessProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 300 + failureThreshold: 100 + startupProbe: + initialDelaySeconds: 3000 + periodSeconds: 60 + timeoutSeconds: 600 + failureThreshold: 3000 + + +secrets: + ngc: + secretName: nvcr-secret + huggingface: + secretName: hf-token-secret + secretData: + token: "hf_api_token" + +volumes: + gcsfuse: + bucketName: "yijiaj-test" + fileCacheCapacity: "500G" + cachePath: "/gcs-cache" + ssdMountPath: "/ssd" + gcsMounts: + - bucketName: "yijiaj-test" + mountPath: "/data/model" + +service: + type: ClusterIP + ports: + frontend: 8000 + worker: 9090 + +workload: + model: deepseek-ai/DeepSeek-R1 + gpus: 16 + framework: sglang + +network: + subnetworks: [] + gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib-diagnostic-arm64:v1.0.7 + ncclSettings: + - name: NCCL_DEBUG + value: "VERSION" + +quantizations: + - "fp8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml new file mode 100644 index 00000000..82029f49 --- /dev/null +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model-path: /data/model/deepseek-ai/DeepSeek-R1 +served-model-name: deepseek-ai/DeepSeek-R1 +log-level: DEBUG +tp: "8" +dp-size: "8" +decode-log-interval: "1" +page-size: "1" +enable-dp-attention: true +trust-remote-code: true +disaggregation-mode: decode +disaggregation-transfer-backend: nixl +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +max-running-requests: "36864" +context-length: "2716" +disable-radix-cache: true +moe-a2a-backend: deepep +prefill-round-robin-balance: true +deepep-mode: normal +moe-dense-tp-size: "1" +enable-dp-lm-head: true +disable-cuda-graph: true +cuda-graph-max-bs: "256" +disable-shared-experts-fusion: true +ep-num-redundant-experts: "32" +ep-dispatch-algorithm: static +eplb-algorithm: deepseek +attention-backend: cutlass_mla +watchdog-timeout: "1000000" +chunked-prefill-size: "36864" +mem-fraction-static: "0.8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml new file mode 100644 index 00000000..939aa2cc --- /dev/null +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml @@ -0,0 +1,46 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +model-path: /data/model/deepseek-ai/DeepSeek-R1 +served-model-name: deepseek-ai/DeepSeek-R1 +log-level: DEBUG +tp: "8" +dp-size: "8" +trust-remote-code: true +decode-log-interval: "1" +page-size: "1" +enable-dp-attention: true +disaggregation-mode: prefill +disaggregation-transfer-backend: nixl +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +max-running-requests: "6144" +context-length: "2716" +disable-radix-cache: true +moe-a2a-backend: deepep +load-balance-method: round_robin +deepep-mode: normal +moe-dense-tp-size: "1" +enable-dp-lm-head: true +disable-shared-experts-fusion: true +ep-num-redundant-experts: "32" +ep-dispatch-algorithm: static +eplb-algorithm: deepseek +attention-backend: cutlass_mla +watchdog-timeout: "1000000" +disable-cuda-graph: true +chunked-prefill-size: "16384" +max-total-tokens: "32768" +mem-fraction-static: "0.8" diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml new file mode 100644 index 00000000..25a2209e --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/Chart.yaml @@ -0,0 +1,20 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v2 +name: a4x-dynamo-deployment +description: a4x-dynamo-deployment +type: application +version: 0.1.0 +appVersion: "0.4.0" \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml new file mode 100644 index 00000000..dc2ab53a --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-compute-domain.yaml @@ -0,0 +1,24 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + name: {{ .Values.dynamo.computeDomain.name }} + namespace: {{ .Values.dynamo.namespace }} +spec: + numNodes: {{ .Values.dynamo.computeDomain.numNodes }} + channel: + resourceClaimTemplate: + name: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml new file mode 100644 index 00000000..efe0306d --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -0,0 +1,470 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: {{ .Values.dynamo.deploymentName }} + namespace: {{ .Values.dynamo.namespace }} +spec: + {{- if .Values.workload.framework }} + backendFramework: {{ .Values.workload.framework }} + {{- end }} + services: + Frontend: + dynamoNamespace: {{ .Values.dynamo.namespace }} + componentType: frontend + replicas: {{ .Values.dynamo.frontend.replicas }} + resources: + requests: + cpu: "5" + memory: "10Gi" + limits: + cpu: "5" + memory: "10Gi" + extraPodMetadata: + annotations: + gke-gcsfuse/volumes: "true" + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/file-cache-capacity: "500Gi" + gke-gcsfuse/cache-path: "/gcs-cache" + extraPodSpec: + tolerations: + - key: "kubernetes.io/arch" + operator: "Equal" + value: "arm64" + effect: "NoSchedule" + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: local-ssd + emptyDir: {} + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:50,file-cache:max-size-mb:-1" + + mainContainer: + image: {{ .Values.dynamo.frontend.image }} + volumeMounts: + - name: local-ssd + mountPath: /gcs-cache + - name: gcs-model-volume + mountPath: /data/model + readOnly: true + resources: + requests: + ephemeral-storage: "30Gi" + limits: + ephemeral-storage: "30Gi" + + Decode: + multinode: + nodeCount: {{ .Values.dynamo.decodeWorker.nodeCount }} + dynamoNamespace: {{ .Values.dynamo.namespace }} + envFromSecret: {{ .Values.secrets.huggingface.secretName }} + componentType: worker + subComponentType: decode + replicas: {{ .Values.dynamo.decodeWorker.replicas }} + livenessProbe: + httpGet: + path: /live + port: system + initialDelaySeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.dynamo.decodeWorker.livenessProbe.periodSeconds }} + timeoutSeconds: {{ 
.Values.dynamo.decodeWorker.livenessProbe.timeoutSeconds }} + failureThreshold: {{ .Values.dynamo.decodeWorker.livenessProbe.failureThreshold }} + readinessProbe: + httpGet: + path: /health + port: system + initialDelaySeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.initialDelaySeconds }} + timeoutSeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.timeoutSeconds }} + periodSeconds: {{ .Values.dynamo.decodeWorker.readinessProbe.periodSeconds }} + failureThreshold: {{ .Values.dynamo.decodeWorker.readinessProbe.failureThreshold }} + sharedMemory: + size: 80Gi + resources: + resources: + limits: + gpu: "4" + claims: + - name: compute-domain-channel + envs: + {{- if .Values.dynamo.decodeWorker.envs }} + {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }} + {{- end }} + extraPodMetadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth2","network":"rdma-0"}, + {"interfaceName":"eth3","network":"rdma-1"}, + {"interfaceName":"eth4","network":"rdma-2"}, + {"interfaceName":"eth5","network":"rdma-3"} + ] + extraPodSpec: + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - arm64 + volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + + mainContainer: + securityContext: + privileged: true + image: {{ .Values.dynamo.decodeWorker.image }} + workingDir: /sgl-workspace/dynamo/components/backends/sglang + startupProbe: + failureThreshold: {{ .Values.dynamo.decodeWorker.startupProbe.failureThreshold }} + httpGet: + path: /live + port: system + periodSeconds: {{ .Values.dynamo.decodeWorker.startupProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.decodeWorker.startupProbe.timeoutSeconds }} + initialDelaySeconds: {{ .Values.dynamo.decodeWorker.startupProbe.initialDelaySeconds }} + command: ["/bin/bash", "-c"] + stdin: true + tty: true + args: + - | + set -e + nvidia-smi + . /usr/local/gib/scripts/set_nccl_env.sh + + echo "--- VERIFYING NCCL ENV VARS IN SHELL ---" + env | grep NCCL_ + echo "--- END VERIFICATION ---" + + {{- if .Values.workload_launcher }} + # Use custom launcher if provided + if [ ! -f "$LAUNCHER_SCRIPT" ]; then + echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" 
+ exit 1 + fi + + ARGS=() + if [ -f "$SERVER_ARGS_FILE" ]; then + echo "Loading server arguments from ConfigMap" + while IFS=': ' read -r key value || [ -n "$key" ]; do + [[ -z "$key" || "$key" == \#* ]] && continue + key=$(echo "$key" | xargs) + value=$(echo "$value" | xargs) + + if [ -n "$key" ]; then + if [[ "$value" == "true" ]]; then + ARGS+=("--$key") + elif [[ "$value" == "false" ]]; then + ARGS+=("--$key" "false") + elif [ -n "$value" ]; then + ARGS+=("--$key" "$value") + else + ARGS+=("--$key") + fi + fi + done < "$SERVER_ARGS_FILE" + fi + + echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" + {{- else }} + exec python3 -m dynamo.sglang \ + --model-path /data/model/deepseek-ai/DeepSeek-R1 \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --log-level DEBUG \ + --tp 8 \ + --dp-size 8 \ + --decode-log-interval 1 \ + --page-size 1 \ + --enable-dp-attention \ + --trust-remote-code \ + --disaggregation-mode decode \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ + --port 9090 \ + --decode-log-interval 1 \ + --max-running-requests 36864 \ + --context-length 2716 \ + --disable-radix-cache \ + --moe-a2a-backend deepep \ + --prefill-round-robin-balance \ + --deepep-mode normal \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-cuda-graph \ + --cuda-graph-max-bs 256 \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --chunked-prefill-size 36864 \ + --mem-fraction-static 0.8 + {{- end }} + + volumeMounts: + - mountPath: /data/model + name: gcs-model-volume + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + {{- if .Values.workload_launcher }} + - name: serving-configuration + mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} + - name: serving-launcher + mountPath: /workload/launcher + {{- end }} + volumes: + {{- if .Values.workload_launcher }} + - name: serving-configuration + configMap: + name: "{{ .Release.Name }}-decode-config" + items: + - key: serving-configuration + path: {{ .Values.workload.configFile | default "serving-args.yaml" }} + - name: serving-launcher + configMap: + name: "{{ .Release.Name }}-launcher" + defaultMode: 0700 + {{- end }} + + + Prefill: + multinode: + nodeCount: {{ .Values.dynamo.prefillWorker.nodeCount }} + dynamoNamespace: {{ .Values.dynamo.namespace }} + envFromSecret: {{ .Values.secrets.huggingface.secretName }} + componentType: worker + subComponentType: prefill + replicas: {{ .Values.dynamo.prefillWorker.replicas }} + livenessProbe: + httpGet: + path: /live + port: system + initialDelaySeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.initialDelaySeconds }} + periodSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.timeoutSeconds }} + failureThreshold: {{ .Values.dynamo.prefillWorker.livenessProbe.failureThreshold }} + readinessProbe: + httpGet: + path: /health + port: system + initialDelaySeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.initialDelaySeconds }} + timeoutSeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.timeoutSeconds }} + periodSeconds: {{ .Values.dynamo.prefillWorker.readinessProbe.periodSeconds }} + failureThreshold: {{ .Values.dynamo.prefillWorker.readinessProbe.failureThreshold }} + sharedMemory: + size: 80Gi 
+ resources: + limits: + gpu: "4" + claims: + - name: compute-domain-channel + envs: + {{- if .Values.dynamo.prefillWorker.envs }} + {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }} + {{- end }} + extraPodMetadata: + annotations: + gke-gcsfuse/cpu-limit: "0" + gke-gcsfuse/ephemeral-storage-limit: "0" + gke-gcsfuse/memory-limit: "0" + gke-gcsfuse/volumes: "true" + networking.gke.io/default-interface: 'eth0' + networking.gke.io/interfaces: | + [ + {"interfaceName":"eth0","network":"default"}, + {"interfaceName":"eth2","network":"rdma-0"}, + {"interfaceName":"eth3","network":"rdma-1"}, + {"interfaceName":"eth4","network":"rdma-2"}, + {"interfaceName":"eth5","network":"rdma-3"} + ] + extraPodSpec: + resourceClaims: + - name: compute-domain-channel + resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/arch + operator: In + values: + - arm64 + volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib + mainContainer: + securityContext: + privileged: true + stdin: true + tty: true + image: {{ .Values.dynamo.prefillWorker.image }} + workingDir: /sgl-workspace/dynamo/components/backends/sglang + startupProbe: + failureThreshold: {{ .Values.dynamo.prefillWorker.startupProbe.failureThreshold }} + httpGet: + path: /live + port: system + periodSeconds: {{ .Values.dynamo.prefillWorker.startupProbe.periodSeconds }} + timeoutSeconds: {{ .Values.dynamo.prefillWorker.startupProbe.timeoutSeconds }} + initialDelaySeconds: {{ .Values.dynamo.prefillWorker.startupProbe.initialDelaySeconds }} + command: ["/bin/bash", "-c"] + args: + - | + set -e + nvidia-smi + . /usr/local/gib/scripts/set_nccl_env.sh + echo "Pre-compiling DeepGEMM kernels for Prefill Worker..." + + echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker." + {{- if .Values.workload_launcher }} + # Use custom launcher if provided + if [ ! -f "$LAUNCHER_SCRIPT" ]; then + echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" 
+ exit 1 + fi + + ARGS=("--is-prefill-worker") + if [ -f "$SERVER_ARGS_FILE" ]; then + echo "Loading server arguments from ConfigMap" + while IFS=': ' read -r key value || [ -n "$key" ]; do + [[ -z "$key" || "$key" == \#* ]] && continue + key=$(echo "$key" | xargs) + value=$(echo "$value" | xargs) + + if [ -n "$key" ]; then + if [[ "$value" == "true" ]]; then + ARGS+=("--$key") + elif [[ "$value" == "false" ]]; then + ARGS+=("--$key" "false") + elif [ -n "$value" ]; then + ARGS+=("--$key" "$value") + else + ARGS+=("--$key") + fi + fi + done < "$SERVER_ARGS_FILE" + fi + + echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" + exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" + {{- else }} + exec python3 -m dynamo.sglang \ + --model-path /data/model/deepseek-ai/DeepSeek-R1 \ + --served-model-name deepseek-ai/DeepSeek-R1 \ + --log-level DEBUG \ + --tp 8 \ + --dp-size 8 \ + --trust-remote-code \ + --decode-log-interval 1 \ + --page-size 1 \ + --enable-dp-attention \ + --disaggregation-mode prefill \ + --disaggregation-transfer-backend nixl \ + --disaggregation-bootstrap-port 30001 \ + --host 0.0.0.0 \ + --port 9090 \ + --decode-log-interval 1 \ + --max-running-requests 6144 \ + --context-length 2716 \ + --disable-radix-cache \ + --moe-a2a-backend deepep \ + --load-balance-method round_robin \ + --deepep-mode normal \ + --moe-dense-tp-size 1 \ + --enable-dp-lm-head \ + --disable-shared-experts-fusion \ + --ep-num-redundant-experts 32 \ + --ep-dispatch-algorithm static \ + --eplb-algorithm deepseek \ + --attention-backend cutlass_mla \ + --watchdog-timeout 1000000 \ + --disable-cuda-graph \ + --chunked-prefill-size 16384 \ + --max-total-tokens 32768 \ + --mem-fraction-static 0.8 + {{- end }} + + volumeMounts: + - mountPath: /data/model + name: gcs-model-volume + - name: library-dir-host + mountPath: /usr/local/nvidia + - name: gib + mountPath: /usr/local/gib + {{- if .Values.workload_launcher }} + - name: serving-configuration + mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} + - name: serving-launcher + mountPath: /workload/launcher + {{- end }} + volumes: + {{- if .Values.workload_launcher }} + - name: serving-configuration + configMap: + name: "{{ .Release.Name }}-prefill-config" + items: + - key: serving-configuration + path: {{ .Values.workload.configFile | default "serving-args.yaml" }} + - name: serving-launcher + configMap: + name: "{{ .Release.Name }}-launcher" + defaultMode: 0700 + {{- end }} diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml new file mode 100644 index 00000000..01e9b51f --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-launcher-configmap.yaml @@ -0,0 +1,28 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-launcher" + namespace: {{ .Values.dynamo.namespace }} +data: + launch-workload.sh: |- +{{- if .Values.workload_launcher }} +{{ .Values.workload_launcher | nindent 4 }} +{{- else }} + #!/bin/bash + echo "No workload launcher specified" + exit 1 +{{- end }} \ No newline at end of file diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml new file mode 100644 index 00000000..f82580ae --- /dev/null +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-worker-configmap.yaml @@ -0,0 +1,35 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{{- if .Values.prefill_serving_config }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-prefill-config" + namespace: {{ .Values.dynamo.namespace }} +data: + serving-configuration: |- +{{ .Values.prefill_serving_config | nindent 4 }} +{{- end }} +--- +{{- if .Values.decode_serving_config }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-decode-config" + namespace: {{ .Values.dynamo.namespace }} +data: + serving-configuration: |- +{{ .Values.decode_serving_config | nindent 4 }} +{{- end }} \ No newline at end of file From 42b686d07d346cfdd7e27886dfe79dd6e98b5e64 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Sat, 27 Dec 2025 00:01:31 +0000 Subject: [PATCH 2/5] fix values --- .../disaggregated-serving/dynamo/README.md | 24 ++++++++----------- .../disaggregated-serving/dynamo/values.yaml | 2 +- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index ec499d10..9a5f91e6 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -1,28 +1,24 @@ # Disaggregated Multi-Node Dynamo Recipe for A4x -This recipe runs a disaggregated multi-node Dynamo deployment on A4x. +This recipe runs a disaggregated multi-node Dynamo deployment on A4X. ## Setup 1. **Set Environment Variables** ```bash - export REPO_ROOT=$(git rev-parse --show-toplevel) - export RELEASE_VERSION="24.05" export USER=$(whoami) ``` 2. 
**Run the Recipe** - ```bash - helm install -f values.yaml \ - --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-vllm-launcher.sh \ - --set-file serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/llama-3.3-70b-multi-node.yaml \ - --set workload.framework=vllm \ - --set workload.model.name=meta-llama/Llama-3.3-70B-Instruct \ - --set workload.image=nvcr.io/nvidia/ai-dynamo/vllm-runtime:${RELEASE_VERSION} \ - --set workload.gpus=16 \ - $USER-dynamo-multi-node-serving-a4x \ - $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment - ``` + ```bash + cd $RECIPE_ROOT + helm install -f values.yaml \ + --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-sglang-launcher.sh \ + --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \ + --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \ + $USER-dynamo-a4x-multi-node \ + $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment + ``` diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index b49162bc..71f43c74 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -15,7 +15,7 @@ dynamo: namespace: dynamo-cloud releaseVersion: "0.7.0" - deploymentName: + deploymentName: disagg2p2d-yijiaj computeDomain: name: yijiaj-a4x-domain numNodes: 4 From 3dfc415947c9b42b1036115b9457c1672bbffc1c Mon Sep 17 00:00:00 2001 From: Yijia J Date: Sat, 27 Dec 2025 00:28:09 +0000 Subject: [PATCH 3/5] update --- .../templates/dynamo-graph-deployment.yaml | 52 +++++++++---------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index efe0306d..cb9fbbf0 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -101,7 +101,6 @@ spec: sharedMemory: size: 80Gi resources: - resources: limits: gpu: "4" claims: @@ -138,20 +137,6 @@ spec: operator: In values: - arm64 - volumes: - - name: gcs-model-volume - csi: - driver: gcsfuse.csi.storage.gke.io - volumeAttributes: - bucketName: {{ .Values.volumes.gcsfuse.bucketName }} - mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 - - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: gib - hostPath: - path: /home/kubernetes/bin/gib - mainContainer: securityContext: privileged: true @@ -260,6 +245,18 @@ spec: mountPath: /workload/launcher {{- end }} volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib {{- if 
.Values.workload_launcher }} - name: serving-configuration configMap: @@ -337,19 +334,6 @@ spec: operator: In values: - arm64 - volumes: - - name: gcs-model-volume - csi: - driver: gcsfuse.csi.storage.gke.io - volumeAttributes: - bucketName: {{ .Values.volumes.gcsfuse.bucketName }} - mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 - - name: library-dir-host - hostPath: - path: /home/kubernetes/bin/nvidia - - name: gib - hostPath: - path: /home/kubernetes/bin/gib mainContainer: securityContext: privileged: true @@ -456,6 +440,18 @@ spec: mountPath: /workload/launcher {{- end }} volumes: + - name: gcs-model-volume + csi: + driver: gcsfuse.csi.storage.gke.io + volumeAttributes: + bucketName: {{ .Values.volumes.gcsfuse.bucketName }} + mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1 + - name: library-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: gib + hostPath: + path: /home/kubernetes/bin/gib {{- if .Values.workload_launcher }} - name: serving-configuration configMap: From 36ccdb63e6b5444cc722654dc890070290fb00a9 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Thu, 1 Jan 2026 00:25:42 +0000 Subject: [PATCH 4/5] recipe 2p2d, README --- .../disaggregated-serving/dynamo/README.md | 302 +++++++++++++++++- .../disaggregated-serving/dynamo/values.yaml | 8 +- .../templates/dynamo-graph-deployment.yaml | 125 +------- 3 files changed, 300 insertions(+), 135 deletions(-) diff --git a/inference/a4x/disaggregated-serving/dynamo/README.md b/inference/a4x/disaggregated-serving/dynamo/README.md index 9a5f91e6..b185b4fb 100644 --- a/inference/a4x/disaggregated-serving/dynamo/README.md +++ b/inference/a4x/disaggregated-serving/dynamo/README.md @@ -1,24 +1,292 @@ -# Disaggregated Multi-Node Dynamo Recipe for A4x +# Disaggregated Multi-Node Inference with NVIDIA Dynamo on A4X GKE -This recipe runs a disaggregated multi-node Dynamo deployment on A4X. +This document outlines the steps to deploy and serve Large Language Models (LLMs) using [NVIDIA Dynamo](https://github.com/ai-dynamo/dynamo) disaggregated inference platform on [A4X GKE Node pools](https://cloud.google.com/kubernetes-engine). -## Setup +Dynamo provides a disaggregated architecture that separates prefill and decode operations for optimized inference performance, supporting both single-node (8 GPUs) and multi-node (16 GPUs) configurations. Dynamo also supports various inference framework backends like [vLLM](https://docs.nvidia.com/dynamo/latest/components/backends/vllm/README.html) and [SGLang](https://docs.nvidia.com/dynamo/latest/components/backends/sglang/README.html). In this recipe, we will focus on serving using the SGLang backend. -1. **Set Environment Variables** + +## Table of Contents - ```bash - export USER=$(whoami) - ``` +* [1. Test Environment](#test-environment) +* [2. Environment Setup (One-Time)](#environment-setup) + * [2.1. Clone the Repository](#clone-repo) + * [2.2. Configure Environment Variables](#configure-vars) + * [2.3. Connect to your GKE Cluster](#connect-cluster) + * [2.4. Create Secrets](#create-secrets) + * [2.5. Install Dynamo Platform](#install-platform) +* [3. Deploy with SGLang Backend](#deploy-sglang) + * [3.1. 
Multi-Node SGLang Deployment (16 GPUs)](#sglang-multi-node) +* [4. Inference Request](#inference-request) +* [5. Monitoring and Troubleshooting](#monitoring) +* [6. Cleanup](#cleanup) -2. **Run the Recipe** + +## 1. Test Environment - ```bash - cd $RECIPE_ROOT - helm install -f values.yaml \ - --set-file workload_launcher=$REPO_ROOT/src/launchers/dynamo-sglang-launcher.sh \ - --set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \ - --set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \ - $USER-dynamo-a4x-multi-node \ - $REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment - ``` +[Back to Top](#table-of-contents) + +This recipe has been tested with the following configuration: + +* **GKE Cluster**: + * GPU node pools with [a4x-highgpu-4g](https://docs.cloud.google.com/compute/docs/gpus#gb200-gpus) machines: + * For multi-node deployment: 4 machines with 4 GPUs each (16 GPUs total) + * [Workload Identity Federation for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/workload-identity) enabled + * [Cloud Storage FUSE CSI driver for GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/cloud-storage-fuse-csi-driver) enabled + +> [!IMPORTANT] +> To prepare the required environment, see the [GKE environment setup guide](../../../../docs/configuring-environment-gke-a4x.md). + + +## 2. Environment Setup (One-Time) + +[Back to Top](#table-of-contents) + + +### 2.1. Clone the Repository + +```bash +git clone https://github.com/ai-hypercomputer/gpu-recipes.git +cd gpu-recipes +export REPO_ROOT=$(pwd) +export RECIPE_ROOT=$REPO_ROOT/inference/a4x/disaggregated-serving/dynamo +``` + + +### 2.2. Configure Environment Variables + +```bash +export PROJECT_ID= +export CLUSTER_REGION= +export CLUSTER_NAME= +export NAMESPACE=dynamo-cloud +export NGC_API_KEY= +export HF_TOKEN= +export RELEASE_VERSION=0.7.0 + +# Set the project for gcloud commands +gcloud config set project $PROJECT_ID +``` + +Replace the following values: + +| Variable | Description | Example | +| -------- | ----------- | ------- | +| `PROJECT_ID` | Your Google Cloud Project ID | `gcp-project-12345` | +| `CLUSTER_REGION` | The GCP region where your GKE cluster is located | `us-central1` | +| `CLUSTER_NAME` | The name of your GKE cluster | `a4x-cluster` | +| `NGC_API_KEY` | Your NVIDIA NGC API key (get from [NGC](https://ngc.nvidia.com)) | `nvapi-xxx...` | +| `HF_TOKEN` | Your Hugging Face access token | `hf_xxx...` | + + +### 2.3. Connect to your GKE Cluster + +```bash +gcloud container clusters get-credentials $CLUSTER_NAME --region $CLUSTER_REGION +``` + + +### 2.4. Create Secrets + +Create the namespace: +```bash +kubectl create namespace ${NAMESPACE} +kubectl config set-context --current --namespace=$NAMESPACE +``` + +Create the Docker registry secret for NVIDIA Container Registry: +```bash +kubectl create secret docker-registry nvcr-secret \ + --namespace=${NAMESPACE} \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password=${NGC_API_KEY} +``` + +Create the secret for the Hugging Face token: +```bash +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN=${HF_TOKEN} \ + -n ${NAMESPACE} +``` + + +### 2.5. 
Install Dynamo Platform
+
+Add the NVIDIA Helm repository:
+```bash
+helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
+  --username='$oauthtoken' --password=${NGC_API_KEY}
+helm repo update
+```
+
+Fetch the Dynamo Helm charts:
+```bash
+helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-crds-${RELEASE_VERSION}.tgz
+helm fetch https://helm.ngc.nvidia.com/nvidia/ai-dynamo/charts/dynamo-platform-${RELEASE_VERSION}.tgz
+```
+
+Install the Dynamo CRDs:
+```bash
+helm install dynamo-crds dynamo-crds-${RELEASE_VERSION}.tgz \
+  --namespace default \
+  --wait \
+  --atomic
+```
+
+Install the Dynamo Platform with Grove and the KAI scheduler enabled:
+```bash
+helm install dynamo-platform dynamo-platform-${RELEASE_VERSION}.tgz \
+  --namespace ${NAMESPACE} --set grove.enabled=true --set kai-scheduler.enabled=true
+```
+
+Verify the installation:
+```bash
+kubectl get pods -n ${NAMESPACE}
+```
+
+Wait until all pods show a `Running` status before proceeding.
+
+
+## 3. Deploy with SGLang Backend
+
+[Back to Top](#table-of-contents)
+
+Deploy Dynamo with the SGLang backend for high-performance inference.
+
+
+### 3.1. Multi-Node SGLang Deployment (16 GPUs)
+
+Multi-node deployment uses 16 GPUs across 4 A4X machines, providing increased capacity for larger models or higher throughput.
+
+#### DeepSeek-R1 671B Model
+
+Deploy DeepSeek-R1 671B across multiple nodes for production workloads. Note the use of `--set-file prefill_serving_config` and `--set-file decode_serving_config`, which point to the prefill and decode config files for a multi-node deployment scenario:
+
+```bash
+cd $RECIPE_ROOT
+helm install -f values.yaml \
+--set-file prefill_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml \
+--set-file decode_serving_config=$REPO_ROOT/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml \
+$USER-dynamo-a4x-multi-node \
+$REPO_ROOT/src/helm-charts/a4x/inference-templates/dynamo-deployment
+```
+
+
+## 4. Inference Request
+[Back to Top](#table-of-contents)
+
+To make an inference request to test the server, we can first run a health check against the server using `curl`:
+
+```bash
+kubectl exec -it -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node-frontend -- curl http://localhost:8000/health | jq
+```
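+
+Once the server reports `healthy`, you can send a single request to verify end-to-end serving. This is a minimal sketch that assumes the frontend's OpenAI-compatible `/v1/chat/completions` route on port 8000 and the `served-model-name` from this recipe's configs; adjust the deployment and model names to match your release:
+
+```bash
+kubectl exec -it -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node-frontend -- \
+  curl -s http://localhost:8000/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+      "model": "deepseek-ai/DeepSeek-R1",
+      "messages": [{"role": "user", "content": "Briefly explain disaggregated prefill/decode serving."}],
+      "max_tokens": 64
+    }' | jq
+```
+
+If you prefer to test from your workstation, a `kubectl port-forward` to the frontend service's port 8000 works as well.
+
+While the workers are still starting up, the health endpoint returns a status like the one below; wait for it to report `healthy` before sending requests: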
+```json
+{
+  "instances": [
+    {
+      "component": "backend",
+      "endpoint": "load_metrics",
+      "instance_id": 3994861215823793160,
+      "namespace": "dynamo",
+      "transport": {
+        "nats_tcp": "dynamo_backend.load_metrics-3770991c30298c08"
+      }
+    },
+    {
+      "component": "prefill",
+      "endpoint": "clear_kv_blocks",
+      "instance_id": 3994861215823793153,
+      "namespace": "dynamo",
+      "transport": {
+        "nats_tcp": "dynamo_prefill.clear_kv_blocks-3770991c30298c01"
+      }
+    },
+    {
+      "component": "prefill",
+      "endpoint": "generate",
+      "instance_id": 3994861215823793153,
+      "namespace": "dynamo",
+      "transport": {
+        "nats_tcp": "dynamo_prefill.generate-3770991c30298c01"
+      }
+    }
+  ],
+  "message": "No endpoints available",
+  "status": "unhealthy"
+}
+```
+
+Then we can send a benchmark request like this:
+
+```bash
+kubectl exec -n ${NAMESPACE} deployment/$USER-dynamo-a4x-multi-node-frontend -- python3 -u -m sglang.bench_serving --backend sglang-oai-chat --base-url http://localhost:8000 --model "deepseek-ai/DeepSeek-R1" --tokenizer /data/model/deepseek-ai/DeepSeek-R1 --dataset-name random --num-prompts 2048 --random-input-len 2048 --random-output-len 512 --max-concurrency 512
+```
+
+
+## 5. Monitoring and Troubleshooting
+
+[Back to Top](#table-of-contents)
+
+View logs for different components (replace with your deployment name):
+
+You can find the exact pod names by running:
+```bash
+kubectl get pods -n ${NAMESPACE}
+```
+
+Frontend logs:
+```bash
+kubectl logs -f deployment/$USER-dynamo-a4x-multi-node-frontend -n ${NAMESPACE}
+```
+
+Decode worker logs:
+```bash
+kubectl logs -f deployment/$USER-dynamo-a4x-multi-node-decode-worker -n ${NAMESPACE}
+```
+
+Prefill worker logs:
+```bash
+kubectl logs -f deployment/$USER-dynamo-a4x-multi-node-prefill-worker -n ${NAMESPACE}
+```
+
+Common issues:
+
+* **Pods stuck in Pending**: Check if nodes have sufficient resources (especially for multi-node deployments)
+* **Model download slow**: Large models like DeepSeek-R1 671B can take 30 minutes or more to download
+* **Multi-node issues**: Verify network connectivity between nodes and proper subnet configuration; see the ComputeDomain check below
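+
+For multi-node issues, it can also help to confirm that the NVIDIA ComputeDomain created by this recipe exists and that its IMEX channel claims were allocated. A quick check (a sketch; the `computedomains` and `resourceclaims` resource types assume the NVIDIA DRA driver is installed, and the domain name comes from `values.yaml`):
+
+```bash
+kubectl get computedomains -n ${NAMESPACE}
+kubectl get resourceclaims -n ${NAMESPACE}
+```
+
+
+## 6. 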
Cleanup + +[Back to Top](#table-of-contents) + +List deployed releases: +```bash +helm list -n ${NAMESPACE} --filter $USER-dynamo- +``` + +Uninstall specific deployments: +```bash +helm uninstall $USER-dynamo-multi-node-serving -n ${NAMESPACE} +``` + +Uninstall Dynamo platform (if no longer needed): +```bash +helm uninstall dynamo-platform -n ${NAMESPACE} +helm uninstall dynamo-crds -n default +``` + +Delete namespace and secrets: +```bash +kubectl delete namespace ${NAMESPACE} +``` + +Clean up downloaded charts: +```bash +rm -f dynamo-crds-${RELEASE_VERSION}.tgz +rm -f dynamo-platform-${RELEASE_VERSION}.tgz +``` diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index 71f43c74..9c271b35 100644 --- a/inference/a4x/disaggregated-serving/dynamo/values.yaml +++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml @@ -15,11 +15,11 @@ dynamo: namespace: dynamo-cloud releaseVersion: "0.7.0" - deploymentName: disagg2p2d-yijiaj + deploymentName: disagg2p2d computeDomain: - name: yijiaj-a4x-domain + name: a4x-domain numNodes: 4 - resourceClaimTemplateName: yijiaj-a4x-channel + resourceClaimTemplateName: a4x-channel frontend: image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1 replicas: 1 @@ -185,6 +185,8 @@ workload: model: deepseek-ai/DeepSeek-R1 gpus: 16 framework: sglang + configFile: serving-args.yaml + configPath: /workload/configs network: subnetworks: [] diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml index cb9fbbf0..67444375 100644 --- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml +++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml @@ -106,6 +106,8 @@ spec: claims: - name: compute-domain-channel envs: + - name: SERVER_ARGS_FILE + value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }} {{- if .Values.dynamo.decodeWorker.envs }} {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }} {{- end }} @@ -163,13 +165,6 @@ spec: env | grep NCCL_ echo "--- END VERIFICATION ---" - {{- if .Values.workload_launcher }} - # Use custom launcher if provided - if [ ! -f "$LAUNCHER_SCRIPT" ]; then - echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" 
- exit 1 - fi - ARGS=() if [ -f "$SERVER_ARGS_FILE" ]; then echo "Loading server arguments from ConfigMap" @@ -191,45 +186,8 @@ spec: fi done < "$SERVER_ARGS_FILE" fi - - echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" - exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" - {{- else }} - exec python3 -m dynamo.sglang \ - --model-path /data/model/deepseek-ai/DeepSeek-R1 \ - --served-model-name deepseek-ai/DeepSeek-R1 \ - --log-level DEBUG \ - --tp 8 \ - --dp-size 8 \ - --decode-log-interval 1 \ - --page-size 1 \ - --enable-dp-attention \ - --trust-remote-code \ - --disaggregation-mode decode \ - --disaggregation-transfer-backend nixl \ - --disaggregation-bootstrap-port 30001 \ - --host 0.0.0.0 \ - --port 9090 \ - --decode-log-interval 1 \ - --max-running-requests 36864 \ - --context-length 2716 \ - --disable-radix-cache \ - --moe-a2a-backend deepep \ - --prefill-round-robin-balance \ - --deepep-mode normal \ - --moe-dense-tp-size 1 \ - --enable-dp-lm-head \ - --disable-cuda-graph \ - --cuda-graph-max-bs 256 \ - --disable-shared-experts-fusion \ - --ep-num-redundant-experts 32 \ - --ep-dispatch-algorithm static \ - --eplb-algorithm deepseek \ - --attention-backend cutlass_mla \ - --watchdog-timeout 1000000 \ - --chunked-prefill-size 36864 \ - --mem-fraction-static 0.8 - {{- end }} + echo "Running: python3 -m dynamo.sglang ${ARGS[@]}" + exec python3 -m dynamo.sglang "${ARGS[@]}" volumeMounts: - mountPath: /data/model @@ -238,12 +196,8 @@ spec: mountPath: /usr/local/nvidia - name: gib mountPath: /usr/local/gib - {{- if .Values.workload_launcher }} - name: serving-configuration mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} - - name: serving-launcher - mountPath: /workload/launcher - {{- end }} volumes: - name: gcs-model-volume csi: @@ -257,19 +211,12 @@ spec: - name: gib hostPath: path: /home/kubernetes/bin/gib - {{- if .Values.workload_launcher }} - name: serving-configuration configMap: name: "{{ .Release.Name }}-decode-config" items: - key: serving-configuration path: {{ .Values.workload.configFile | default "serving-args.yaml" }} - - name: serving-launcher - configMap: - name: "{{ .Release.Name }}-launcher" - defaultMode: 0700 - {{- end }} - Prefill: multinode: @@ -303,6 +250,8 @@ spec: claims: - name: compute-domain-channel envs: + - name: SERVER_ARGS_FILE + value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }} {{- if .Values.dynamo.prefillWorker.envs }} {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }} {{- end }} @@ -356,16 +305,9 @@ spec: nvidia-smi . /usr/local/gib/scripts/set_nccl_env.sh echo "Pre-compiling DeepGEMM kernels for Prefill Worker..." - echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker." - {{- if .Values.workload_launcher }} - # Use custom launcher if provided - if [ ! -f "$LAUNCHER_SCRIPT" ]; then - echo "Error: Launcher script $LAUNCHER_SCRIPT not found!" 
- exit 1 - fi - ARGS=("--is-prefill-worker") + ARGS=() if [ -f "$SERVER_ARGS_FILE" ]; then echo "Loading server arguments from ConfigMap" while IFS=': ' read -r key value || [ -n "$key" ]; do @@ -386,45 +328,8 @@ spec: fi done < "$SERVER_ARGS_FILE" fi - - echo "Running: $LAUNCHER_SCRIPT ${ARGS[@]}" - exec "$LAUNCHER_SCRIPT" "${ARGS[@]}" - {{- else }} - exec python3 -m dynamo.sglang \ - --model-path /data/model/deepseek-ai/DeepSeek-R1 \ - --served-model-name deepseek-ai/DeepSeek-R1 \ - --log-level DEBUG \ - --tp 8 \ - --dp-size 8 \ - --trust-remote-code \ - --decode-log-interval 1 \ - --page-size 1 \ - --enable-dp-attention \ - --disaggregation-mode prefill \ - --disaggregation-transfer-backend nixl \ - --disaggregation-bootstrap-port 30001 \ - --host 0.0.0.0 \ - --port 9090 \ - --decode-log-interval 1 \ - --max-running-requests 6144 \ - --context-length 2716 \ - --disable-radix-cache \ - --moe-a2a-backend deepep \ - --load-balance-method round_robin \ - --deepep-mode normal \ - --moe-dense-tp-size 1 \ - --enable-dp-lm-head \ - --disable-shared-experts-fusion \ - --ep-num-redundant-experts 32 \ - --ep-dispatch-algorithm static \ - --eplb-algorithm deepseek \ - --attention-backend cutlass_mla \ - --watchdog-timeout 1000000 \ - --disable-cuda-graph \ - --chunked-prefill-size 16384 \ - --max-total-tokens 32768 \ - --mem-fraction-static 0.8 - {{- end }} + echo "Running: python3 -m dynamo.sglang ${ARGS[@]}" + exec python3 -m dynamo.sglang "${ARGS[@]}" volumeMounts: - mountPath: /data/model @@ -433,12 +338,8 @@ spec: mountPath: /usr/local/nvidia - name: gib mountPath: /usr/local/gib - {{- if .Values.workload_launcher }} - name: serving-configuration mountPath: {{ .Values.workload.configPath | default "/workload/configs" }} - - name: serving-launcher - mountPath: /workload/launcher - {{- end }} volumes: - name: gcs-model-volume csi: @@ -452,15 +353,9 @@ spec: - name: gib hostPath: path: /home/kubernetes/bin/gib - {{- if .Values.workload_launcher }} - name: serving-configuration configMap: name: "{{ .Release.Name }}-prefill-config" items: - key: serving-configuration - path: {{ .Values.workload.configFile | default "serving-args.yaml" }} - - name: serving-launcher - configMap: - name: "{{ .Release.Name }}-launcher" - defaultMode: 0700 - {{- end }} + path: {{ .Values.workload.configFile | default "serving-args.yaml" }} \ No newline at end of file From e7503e8c5efa13a7f5c3aab439d534b886dcd572 Mon Sep 17 00:00:00 2001 From: Yijia J Date: Sat, 10 Jan 2026 06:13:31 +0000 Subject: [PATCH 5/5] add 10p8d configs, add path without gcsfuse --- .../disaggregated-serving/dynamo/values.yaml | 69 ++++++++++++------- .../deepseekr1-fp8-10p8d-decode.yaml | 50 ++++++++++++++ .../deepseekr1-fp8-10p8d-prefill.yaml | 50 ++++++++++++++ ...e.yaml => deepseekr1-fp8-2p2d-decode.yaml} | 1 - ....yaml => deepseekr1-fp8-2p2d-prefill.yaml} | 1 - .../templates/dynamo-graph-deployment.yaml | 59 ++++++++++++++-- 6 files changed, 197 insertions(+), 33 deletions(-) create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml create mode 100644 src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-multi-node-decode.yaml => deepseekr1-fp8-2p2d-decode.yaml} (96%) rename src/frameworks/a4x/dynamo-configs/{deepseekr1-fp8-multi-node-prefill.yaml => deepseekr1-fp8-2p2d-prefill.yaml} (96%) diff --git a/inference/a4x/disaggregated-serving/dynamo/values.yaml b/inference/a4x/disaggregated-serving/dynamo/values.yaml index 9c271b35..a047a65f 100644 
--- a/inference/a4x/disaggregated-serving/dynamo/values.yaml
+++ b/inference/a4x/disaggregated-serving/dynamo/values.yaml
@@ -13,15 +13,16 @@
 # limitations under the License.
 
 dynamo:
-  namespace: dynamo-cloud
+  namespace: yijiaj-test
   releaseVersion: "0.7.0"
-  deploymentName: disagg2p2d
+  deploymentName: disagg2p2d-yijiaj
   computeDomain:
-    name: a4x-domain
+    name: yijiaj-a4x-domain
     numNodes: 4
-    resourceClaimTemplateName: a4x-channel
+    resourceClaimTemplateName: yijiaj-a4x-channel
+  serviceAccountName: dynamo-platform-dynamo-operator-component
   frontend:
-    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.6.1
+    image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.7.0
     replicas: 1
     livenessProbe:
       initialDelaySeconds: 3000
@@ -34,24 +35,34 @@ dynamo:
       timeoutSeconds: 300
       failureThreshold: 100
   decodeWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout
+    #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
     nodeCount: 2
     replicas: 1
     envs:
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: HF_TOKEN
+    - name: HF_HUB_ENABLE_HF_TRANSFER
+      value: "1"
     - name: LD_LIBRARY_PATH
       value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
     - name: GLOO_SOCKET_IFNAME
       value: eth0
     - name: TP_SOCKET_IFNAME
       value: eth0
-    - name: SGLANG_ENABLE_JIT_DEEPGEMM
-      value: "1"
+    # - name: SGLANG_ENABLE_JIT_DEEPGEMM
+    #   value: "1"
     - name: DYN_SKIP_SGLANG_LOG_FORMATTING
       value: "1"
+    - name: SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK
+      value: "256"
     - name: MC_TE_METRIC
       value: "true"
-    - name: SGLANG_ENABLE_FLASHINFER_GEMM
-      value: "1"
+    # - name: SGLANG_ENABLE_FLASHINFER_GEMM
+    #   value: "1"
     - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
       value: "100000"
     - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
@@ -76,14 +87,14 @@ dynamo:
       value: "1"
     - name: PYTHONUNBUFFERED
       value: "1"
-    - name: NCCL_DEBUG
-      value: INFO
-    - name: NCCL_DEBUG_SUBSYS
-      value: INIT,BOOTSTRAP,ENV,NET,GRAPH
-    - name: NCCL_SOCKET_FAMILY
-      value: "AF_INET"
-    - name: GLOO_SOCKET_FAMILY
-      value: "AF_INET"
+    # - name: NCCL_DEBUG
+    #   value: INFO
+    # - name: NCCL_DEBUG_SUBSYS
+    #   value: INIT,BOOTSTRAP,ENV,NET,GRAPH
+    # - name: NCCL_SOCKET_FAMILY
+    #   value: "AF_INET"
+    # - name: GLOO_SOCKET_FAMILY
+    #   value: "AF_INET"
     livenessProbe:
       initialDelaySeconds: 3000
       periodSeconds: 60
@@ -100,10 +111,18 @@ dynamo:
       timeoutSeconds: 600
       failureThreshold: 3000
   prefillWorker:
-    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
+    image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.7.0-sglang-0.5.5.post2-timeout
+    #image: us-central1-docker.pkg.dev/linglinll-gke-dev/dynamo/dynamo-base:dynamo-wideep-gb200-v0.6.1
    nodeCount: 2
    replicas: 1
    envs:
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: HF_TOKEN
+    - name: HF_HUB_ENABLE_HF_TRANSFER
+      value: "1"
     - name: LD_LIBRARY_PATH
       value: "/usr/local/ucx/lib:/usr/local/ucx/lib/ucx:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu:/opt/nvidia/nvda_nixl/lib/aarch64-linux-gnu/plugins:/usr/local/nvidia/lib64"
     - name: UCX_TLS
@@ -112,14 +131,14 @@ dynamo:
       value: eth0
     - name: TP_SOCKET_IFNAME
       value: eth0
-    - name: SGLANG_ENABLE_JIT_DEEPGEMM
-      value: "1"
+    # - name: SGLANG_ENABLE_JIT_DEEPGEMM
+    #   value: "1"
     - name: DYN_SKIP_SGLANG_LOG_FORMATTING
       value: "1"
     - name: MC_TE_METRIC
       value: "true"
-    - name: SGLANG_ENABLE_FLASHINFER_GEMM
-      value: "1"
+    # - name: SGLANG_ENABLE_FLASHINFER_GEMM
+    #   value: "1"
     - name: SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE
       value: "100000"
     - name: SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT
@@ -166,14 +185,14 @@ secrets:
   token: "hf_api_token"
 
 volumes:
+  useGcs: true
   gcsfuse:
     bucketName: "yijiaj-test"
     fileCacheCapacity: "500G"
     cachePath: "/gcs-cache"
     ssdMountPath: "/ssd"
   gcsMounts:
-  - bucketName: "yijiaj-test"
-    mountPath: "/data/model"
+    mountPath: "/data/model"
 
 service:
   type: ClusterIP
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
new file mode 100644
index 00000000..bbbdf18f
--- /dev/null
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-decode.yaml
@@ -0,0 +1,50 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+served-model-name: deepseek-ai/DeepSeek-R1
+disaggregation-mode: decode
+disaggregation-bootstrap-port: "30001"
+host: "0.0.0.0"
+port: "9090"
+trust-remote-code: true
+skip-tokenizer-init: true
+tp-size: "32"
+dp-size: "32"
+ep-size: "32"
+quantization: "fp8"
+# page-size: "1"
+enable-dp-attention: true
+attention-backend: "trtllm_mla"
+kv-cache-dtype: "fp8_e4m3"
+disable-radix-cache: true
+stream-interval: "50"
+# disaggregation-transfer-backend: nixl
+decode-log-interval: "1000"
+max-running-requests: "8192"
+context-length: "9300"
+watchdog-timeout: "1000000"
+disable-shared-experts-fusion: true
+eplb-algorithm: deepseek
+mem-fraction-static: "0.82"
+chunked-prefill-size: "36864"
+moe-a2a-backend: "deepep"
+deepep-mode: "low_latency"
+ep-dispatch-algorithm: static
+moe-dense-tp-size: "1"
+enable-dp-lm-head: true
+prefill-round-robin-balance: true
+ep-num-redundant-experts: "32"
+cuda-graph-max-bs: "256"
+# disable-cuda-graph: true
+deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}'
\ No newline at end of file
diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
new file mode 100644
index 00000000..f5748607
--- /dev/null
+++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-10p8d-prefill.yaml
@@ -0,0 +1,50 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +served-model-name: deepseek-ai/DeepSeek-R1 +# log-level: DEBUG +disaggregation-mode: prefill +disaggregation-bootstrap-port: "30001" +host: "0.0.0.0" +port: "9090" +trust-remote-code: true +tp-size: "8" +dp-size: "8" +ep-size: "8" +quantization: "fp8" +enable-dp-attention: true +attention-backend: "trtllm_mla" +kv-cache-dtype: "fp8_e4m3" +disable-radix-cache: true +stream-interval: "50" +max-running-requests: "30000" +context-length: "9300" +# decode-log-interval: "1" +# page-size: "1" +# disaggregation-transfer-backend: nixl +watchdog-timeout: "1000000" +disable-shared-experts-fusion: true +eplb-algorithm: deepseek +mem-fraction-static: "0.8" +max-total-tokens: "524288" +chunked-prefill-size: "131072" +load-balance-method: round_robin +disable-cuda-graph: true +moe-a2a-backend: deepep +deepep-mode: normal +ep-dispatch-algorithm: "dynamic" +moe-dense-tp-size: "1" +enable-dp-lm-head: true +ep-num-redundant-experts: "32" +deepep-config: '{"normal_dispatch": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 28,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 256}, "normal_combine": {"num_sms": 128,"num_max_nvl_chunked_send_tokens": 15,"num_max_nvl_chunked_recv_tokens": 256,"num_max_rdma_chunked_send_tokens": 6,"num_max_rdma_chunked_recv_tokens": 128}}' \ No newline at end of file diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml similarity index 96% rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml index 82029f49..a2287217 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-decode.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-decode.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -model-path: /data/model/deepseek-ai/DeepSeek-R1 served-model-name: deepseek-ai/DeepSeek-R1 log-level: DEBUG tp: "8" diff --git a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml similarity index 96% rename from src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml rename to src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml index 939aa2cc..f2abbcd4 100644 --- a/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-multi-node-prefill.yaml +++ b/src/frameworks/a4x/dynamo-configs/deepseekr1-fp8-2p2d-prefill.yaml @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-model-path: /data/model/deepseek-ai/DeepSeek-R1
 served-model-name: deepseek-ai/DeepSeek-R1
 log-level: DEBUG
 tp: "8"
diff --git a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
index 67444375..0ac6cdf5 100644
--- a/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
+++ b/src/helm-charts/a4x/inference-templates/dynamo-deployment/templates/dynamo-graph-deployment.yaml
@@ -35,12 +35,14 @@ spec:
           memory: "10Gi"
       extraPodMetadata:
         annotations:
+          {{- if eq .Values.volumes.useGcs true }}
           gke-gcsfuse/volumes: "true"
           gke-gcsfuse/cpu-limit: "0"
           gke-gcsfuse/memory-limit: "0"
           gke-gcsfuse/ephemeral-storage-limit: "0"
           gke-gcsfuse/file-cache-capacity: "500Gi"
           gke-gcsfuse/cache-path: "/gcs-cache"
+          {{- end }}
       extraPodSpec:
         tolerations:
         - key: "kubernetes.io/arch"
@@ -53,21 +55,24 @@ spec:
         volumes:
         - name: local-ssd
           emptyDir: {}
+        {{- if eq .Values.volumes.useGcs true }}
        - name: gcs-model-volume
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
              mountOptions: "implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:50,file-cache:max-size-mb:-1"
-
+        {{- end }}
         mainContainer:
           image: {{ .Values.dynamo.frontend.image }}
+          {{- if eq .Values.volumes.useGcs true }}
           volumeMounts:
           - name: local-ssd
             mountPath: /gcs-cache
          - name: gcs-model-volume
            mountPath: /data/model
            readOnly: true
+          {{- end }}
           resources:
             requests:
               ephemeral-storage: "30Gi"
@@ -108,15 +113,21 @@ spec:
       envs:
       - name: SERVER_ARGS_FILE
         value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }}
+      {{- if eq .Values.volumes.useGcs true }}
+      - name: MODEL_PATH
+        value: {{ .Values.volumes.gcsMounts.mountPath }}/{{ .Values.workload.model }}
+      {{- end }}
      {{- if .Values.dynamo.decodeWorker.envs }}
      {{- toYaml .Values.dynamo.decodeWorker.envs | nindent 8 }}
      {{- end }}
      extraPodMetadata:
        annotations:
+          {{- if eq .Values.volumes.useGcs true }}
          gke-gcsfuse/cpu-limit: "0"
          gke-gcsfuse/ephemeral-storage-limit: "0"
          gke-gcsfuse/memory-limit: "0"
          gke-gcsfuse/volumes: "true"
+          {{- end }}
          networking.gke.io/default-interface: 'eth0'
          networking.gke.io/interfaces: |
            [
              {"interfaceName":"eth0","network":"default"},
              {"interfaceName":"eth2","network":"rdma-0"},
              {"interfaceName":"eth3","network":"rdma-1"},
              {"interfaceName":"eth4","network":"rdma-2"},
              {"interfaceName":"eth5","network":"rdma-3"}
            ]
@@ -127,6 +138,9 @@ spec:
       extraPodSpec:
+        {{- if .Values.dynamo.serviceAccountName }}
+        serviceAccountName: {{ .Values.dynamo.serviceAccountName }}
+        {{- end }}
         resourceClaims:
         - name: compute-domain-channel
           resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }}
@@ -164,8 +178,16 @@ spec:
           echo "--- VERIFYING NCCL ENV VARS IN SHELL ---"
           env | grep NCCL_
           echo "--- END VERIFICATION ---"
+          pip install hf_transfer  # runtime dependency for HF_HUB_ENABLE_HF_TRANSFER=1
 
           ARGS=()
+          if [ -n "$MODEL_PATH" ]; then
+            echo "Adding model path from env var: $MODEL_PATH"
+            ARGS+=("--model-path" "$MODEL_PATH")
+          else
+            echo "MODEL_PATH not set (gcsfuse disabled); using workload.model from Helm values"
+            ARGS+=("--model" "{{ .Values.workload.model }}")
+          fi
           if [ -f "$SERVER_ARGS_FILE" ]; then
             echo "Loading server arguments from ConfigMap"
             while IFS=': ' read -r key value || [ -n "$key" ]; do
@@ -190,8 +212,10 @@ spec:
           exec python3 -m dynamo.sglang "${ARGS[@]}"
 
         volumeMounts:
+        {{- if eq .Values.volumes.useGcs true }}
        - mountPath: /data/model
          name: gcs-model-volume
+        {{- end }}
        - name: library-dir-host
          mountPath: /usr/local/nvidia
        - name: gib
@@ -199,12 +223,14 @@ spec:
        - name: serving-configuration
          mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
        volumes:
+        {{- if eq .Values.volumes.useGcs true }}
        - name: gcs-model-volume
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        {{- end }}
        - name: library-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia
@@ -227,9 +253,11 @@ spec:
       subComponentType: prefill
       replicas: {{ .Values.dynamo.prefillWorker.replicas }}
       livenessProbe:
-        httpGet:
-          path: /live
-          port: system
+        exec:
+          command:
+          - /bin/sh
+          - -c
+          - "exit 0"
         initialDelaySeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.initialDelaySeconds }}
         periodSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.periodSeconds }}
         timeoutSeconds: {{ .Values.dynamo.prefillWorker.livenessProbe.timeoutSeconds }}
@@ -252,15 +280,21 @@ spec:
       envs:
       - name: SERVER_ARGS_FILE
         value: {{ .Values.workload.configPath }}/{{ .Values.workload.configFile }}
+      {{- if eq .Values.volumes.useGcs true }}
+      - name: MODEL_PATH
+        value: {{ .Values.volumes.gcsMounts.mountPath }}/{{ .Values.workload.model }}
+      {{- end }}
      {{- if .Values.dynamo.prefillWorker.envs }}
      {{- toYaml .Values.dynamo.prefillWorker.envs | nindent 8 }}
      {{- end }}
      extraPodMetadata:
        annotations:
+          {{- if eq .Values.volumes.useGcs true }}
          gke-gcsfuse/cpu-limit: "0"
          gke-gcsfuse/ephemeral-storage-limit: "0"
          gke-gcsfuse/memory-limit: "0"
          gke-gcsfuse/volumes: "true"
+          {{- end }}
          networking.gke.io/default-interface: 'eth0'
          networking.gke.io/interfaces: |
            [
              {"interfaceName":"eth0","network":"default"},
              {"interfaceName":"eth2","network":"rdma-0"},
              {"interfaceName":"eth3","network":"rdma-1"},
              {"interfaceName":"eth4","network":"rdma-2"},
              {"interfaceName":"eth5","network":"rdma-3"}
            ]
@@ -271,6 +305,9 @@ spec:
       extraPodSpec:
+        {{- if .Values.dynamo.serviceAccountName }}
+        serviceAccountName: {{ .Values.dynamo.serviceAccountName }}
+        {{- end }}
         resourceClaims:
         - name: compute-domain-channel
           resourceClaimTemplateName: {{ .Values.dynamo.computeDomain.resourceClaimTemplateName }}
@@ -304,10 +341,16 @@ spec:
           set -e
           nvidia-smi
           . /usr/local/gib/scripts/set_nccl_env.sh
-          echo "Pre-compiling DeepGEMM kernels for Prefill Worker..."
-          echo "Finished pre-compiling DeepGEMM kernels for Prefill Worker."
+          pip install hf_transfer  # runtime dependency for HF_HUB_ENABLE_HF_TRANSFER=1
 
           ARGS=()
+          if [ -n "$MODEL_PATH" ]; then
+            echo "Adding model path from env var: $MODEL_PATH"
+            ARGS+=("--model-path" "$MODEL_PATH")
+          else
+            echo "MODEL_PATH not set (gcsfuse disabled); using workload.model from Helm values"
+            ARGS+=("--model" "{{ .Values.workload.model }}")
+          fi
           if [ -f "$SERVER_ARGS_FILE" ]; then
             echo "Loading server arguments from ConfigMap"
             while IFS=': ' read -r key value || [ -n "$key" ]; do
@@ -332,8 +375,10 @@ spec:
           exec python3 -m dynamo.sglang "${ARGS[@]}"
 
         volumeMounts:
+        {{- if eq .Values.volumes.useGcs true }}
        - mountPath: /data/model
          name: gcs-model-volume
+        {{- end }}
        - name: library-dir-host
          mountPath: /usr/local/nvidia
        - name: gib
@@ -341,12 +386,14 @@ spec:
        - name: serving-configuration
          mountPath: {{ .Values.workload.configPath | default "/workload/configs" }}
        volumes:
+        {{- if eq .Values.volumes.useGcs true }}
        - name: gcs-model-volume
          csi:
            driver: gcsfuse.csi.storage.gke.io
            volumeAttributes:
              bucketName: {{ .Values.volumes.gcsfuse.bucketName }}
              mountOptions: implicit-dirs,file-cache:enable-parallel-downloads:true,file-cache:parallel-downloads-per-file:100,file-cache:max-parallel-downloads:-1,file-cache:download-chunk-size-mb:10,file-cache:max-size-mb:-1
+        {{- end }}
        - name: library-dir-host
          hostPath:
            path: /home/kubernetes/bin/nvidia
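
Note on the launcher logic above: the hunks only show the edges of the ConfigMap-parsing loop, because its body is unchanged context that the diff elides. Below is a minimal standalone sketch of what such a loop plausibly does; the comment-skipping, quote-stripping, and boolean handling are illustrative assumptions, not the recipe's verbatim code.

```bash
#!/usr/bin/env bash
# Hypothetical reconstruction of the elided SERVER_ARGS_FILE loop: converts
# flat "key: value" pairs (as in the deepseekr1-fp8-*.yaml configs above)
# into CLI flags for `python3 -m dynamo.sglang`.
set -e

SERVER_ARGS_FILE="${SERVER_ARGS_FILE:-/workload/configs/serving-args.yaml}"
ARGS=()

if [ -f "$SERVER_ARGS_FILE" ]; then
  echo "Loading server arguments from ConfigMap"
  while IFS=': ' read -r key value || [ -n "$key" ]; do
    # Skip blank lines and YAML comments (assumed behavior).
    if [ -z "$key" ]; then continue; fi
    case "$key" in "#"*) continue ;; esac
    # Strip the surrounding double quotes carried by the config values.
    value="${value%\"}"
    value="${value#\"}"
    if [ "$value" = "true" ]; then
      ARGS+=("--$key")            # a bare boolean becomes a flag
    elif [ -n "$value" ]; then
      ARGS+=("--$key" "$value")   # everything else becomes --key value
    fi
  done < "$SERVER_ARGS_FILE"
fi

echo "Running: python3 -m dynamo.sglang ${ARGS[*]}"
```

Fed `deepseekr1-fp8-10p8d-decode.yaml`, this would turn `disaggregation-mode: decode` into `--disaggregation-mode decode` and `disable-radix-cache: true` into the bare flag `--disable-radix-cache`; single-quoted values such as `deepep-config` would need an extra stripping step in the real launcher.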