|
1 | | -# Copyright 2025 Google LLC |
2 | | -# |
3 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | -# you may not use this file except in compliance with the License. |
5 | | -# You may obtain a copy of the License at |
6 | | -# |
7 | | -# http://www.apache.org/licenses/LICENSE-2.0 |
8 | | -# |
9 | | -# Unless required by applicable law or agreed to in writing, software |
10 | | -# distributed under the License is distributed on an "AS IS" BASIS, |
11 | | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | -# See the License for the specific language governing permissions and |
13 | | -# limitations under the License. |
14 | | - |
15 | | -apiVersion: jobset.x-k8s.io/v1alpha2 |
16 | | -kind: JobSet |
17 | | -metadata: |
18 | | - name: pathways-jobset-inference |
19 | | - # annotations: |
20 | | - # alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment |
21 | | -spec: |
22 | | - failurePolicy: |
23 | | - maxRestarts: 4 # The set will be restarted on failures up to 4 times. |
24 | | - replicatedJobs: |
25 | | - - name: leader # Part of the name of the child Jobs (<replicatedJobName>) |
26 | | - replicas: 1 # Replicas of the Pathways Resource Manager, Proxy, JetStream and Tester. Should always be 1. |
27 | | - template: |
28 | | - spec: # JobSpec |
29 | | - parallelism: 1 # Must be set to number of nodes in each node pool |
30 | | - completions: 1 # Must be set to number of nodes in each node pool |
31 | | - backoffLimit: 0 # Must be set to 0. Fail the job when any pod fails. |
32 | | - template: |
33 | | - spec: |
34 | | - affinity: |
35 | | - podAffinity: |
36 | | - requiredDuringSchedulingIgnoredDuringExecution: |
37 | | - - labelSelector: |
38 | | - matchExpressions: |
39 | | - - key: jobset.sigs.k8s.io/jobset-name |
40 | | - operator: In |
41 | | - values: |
42 | | - - pathways-jobset-inference |
43 | | - topologyKey: cloud.google.com/gke-nodepool |
44 | | - podAntiAffinity: # keeps this pod out of any node pool that already hosts pods from other Jobs/JobSets |
45 | | - requiredDuringSchedulingIgnoredDuringExecution: |
46 | | - - labelSelector: |
47 | | - matchExpressions: |
48 | | - - key: jobset.sigs.k8s.io/jobset-name |
49 | | - operator: NotIn |
50 | | - values: |
51 | | - - pathways-jobset-inference |
52 | | - - key: job-name |
53 | | - operator: Exists |
54 | | - namespaceSelector: {} |
55 | | - topologyKey: cloud.google.com/gke-nodepool |
56 | | - nodeSelector: |
57 | | - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice |
58 | | - cloud.google.com/gke-tpu-topology: 2x2x2 |
59 | | - tolerations: |
60 | | - - effect: NoSchedule |
61 | | - key: google.com/tpu |
62 | | - operator: Exists |
63 | | - # nodeSelector: |
64 | | - # cloud.google.com/gke-nodepool: cpu-user-np |
65 | | - volumes: |
66 | | - - name: shared-tmp |
67 | | - hostPath: |
68 | | - path: /tmp |
69 | | - type: DirectoryOrCreate |
70 | | - containers: |
71 | | - - name: pathways-rm |
72 | | - image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/server:latest |
73 | | - imagePullPolicy: Always # Sometimes k8s was reusing the old image |
74 | | - args: |
75 | | - - --alsologtostderr |
76 | | - - --pathways_server_port=38677 |
77 | | - - --pathways_server_provides_devices=false |
78 | | - - --pathways_device_type=NONE |
79 | | - - --pathways_persistent_compilation_cache=false |
80 | | - - --pathways_compilation_mode=compile_at_worker |
81 | | - - --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp |
82 | | - - --pathways_expected_instances=tpuv4:2x2x2 |
83 | | - env: |
84 | | - - name: TPU_SKIP_MDS_QUERY |
85 | | - value: "true" |
86 | | - - name: REPLICATED_JOB_NAME |
87 | | - valueFrom: |
88 | | - fieldRef: |
89 | | - fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name'] |
90 | | - - name: JOBSET_NAME |
91 | | - valueFrom: |
92 | | - fieldRef: |
93 | | - fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name'] |
94 | | - - name: HOST_ADDRESS |
95 | | - value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME) |
96 | | - ports: |
97 | | - - containerPort: 38677 |
98 | | - - containerPort: 38678 |
99 | | - resources: |
100 | | - limits: |
101 | | - cpu: "4" |
102 | | - memory: "8G" |
103 | | - securityContext: |
104 | | - privileged: true |
105 | | - - name: pathways-proxy |
106 | | - args: |
107 | | - - --alsologtostderr |
108 | | - - --v=0 |
109 | | - - --pathways_ifrt_proxy_server_resource_manager=pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38677 |
110 | | - # - --pathways_ifrt_proxy_server_resource_manager=localhost:38677 |
111 | | - - --pathways_ifrt_proxy_server_port=38681 |
112 | | - - --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp |
113 | | - - --pathways_plaque_network=gcp |
114 | | - image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/proxy_server:latest |
115 | | - imagePullPolicy: Always |
116 | | - ports: |
117 | | - - containerPort: 38681 |
118 | | - - containerPort: 38682 |
119 | | - resources: |
120 | | - limits: |
121 | | - cpu: "4" |
122 | | - memory: 10G |
123 | | - securityContext: |
124 | | - privileged: true |
125 | | - - name: jetstream |
126 | | - image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest |
127 | | - imagePullPolicy: Always |
128 | | - ports: |
129 | | - - containerPort: 9000 |
130 | | - env: |
131 | | - - name: XCLOUD_ENVIRONMENT |
132 | | - value: GCP |
133 | | - - name: JAX_PLATFORMS |
134 | | - value: proxy |
135 | | - - name: JAX_BACKEND_TARGET |
136 | | - value: grpc://pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38681 |
137 | | - # value: grpc://localhost:38681 |
138 | | - command: # Launches maxengine_server.py in the background, polls until it exits, prints its exit code, then sleeps indefinitely. |
139 | | - - bash |
140 | | - - -c |
141 | | - - 'echo Start: $(date); _sigterm() ( kill -SIGTERM $! 2>/dev/null;); trap |
142 | | - _sigterm SIGTERM; (JAX_TRACEBACK_FILTERING=off python3 MaxText/maxengine_server.py |
143 | | - MaxText/configs/inference_jetstream.yml tokenizer_path=assets/tokenizer.llama2 |
144 | | - load_parameters_path=gs://runner-maxtext-logs/2024-05-07-23-34/unscanned_chkpt/checkpoints/0/items |
145 | | - max_prefill_predict_length=1024 max_target_length=2048 async_checkpointing=false |
146 | | - model_name=''llama2-70b'' steps=1 ici_fsdp_parallelism=1 ici_autoregressive_parallelism=-1 |
147 | | - ici_tensor_parallelism=1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=2) |
148 | | - & PID=$!; while kill -0 $PID 2>/dev/null; do sleep 5; done; wait $PID; |
149 | | - EXIT_CODE=$?; echo EXIT_CODE=$EXIT_CODE; echo End sleep: $(date); sleep |
150 | | - infinity;' |
151 | | - - name: tester |
152 | | - image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest |
153 | | - imagePullPolicy: Always |
154 | | - env: null |
155 | | - command: # Sends up to five requests via requester.py, stops at the first non-zero exit code, prints the last exit code, then sleeps indefinitely. |
156 | | - - bash |
157 | | - - -c |
158 | | - - 'echo Start: $(date); _sigterm() ( kill -SIGTERM $! 2>/dev/null;); trap |
159 | | - _sigterm SIGTERM; for i in {1..5}; do echo Sending request $i; time python3 |
160 | | - JetStream/jetstream/tools/requester.py --tokenizer assets/tokenizer.llama2 |
161 | | - --max_tokens=16 --server=0.0.0.0 --text="why earth is round"; EXIT_CODE=$?; |
162 | | - echo Completed request; echo EXIT_CODE=$EXIT_CODE; if [[ $EXIT_CODE -ne |
163 | | - 0 ]]; then break; fi; done; echo Last EXIT_CODE=$EXIT_CODE; echo End sleep: |
164 | | - $(date); sleep infinity;' |
165 | | - securityContext: |
166 | | - privileged: true |
167 | | - - name: worker # Part of the name of the child Jobs (<replicatedJobName>) |
168 | | - replicas: 1 # Number of slices |
169 | | - template: |
170 | | - spec: |
171 | | - parallelism: 2 # Must be set to number of nodes in each node pool |
172 | | - completions: 2 # Must be set to number of nodes in each node pool |
173 | | - backoffLimit: 0 # Must be set to 0. Fail the job when any pod fails. |
174 | | - template: |
175 | | - spec: |
176 | | - nodeSelector: |
177 | | - cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice |
178 | | - cloud.google.com/gke-tpu-topology: 2x2x2 |
179 | | - volumes: |
180 | | - - name: shared-tmp |
181 | | - hostPath: |
182 | | - path: /tmp |
183 | | - type: DirectoryOrCreate |
184 | | - containers: |
185 | | - - name: pathways-worker |
186 | | - securityContext: |
187 | | - privileged: true |
188 | | - image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/server:latest |
189 | | - imagePullPolicy: Always # Sometimes k8s was reusing the old image |
190 | | - env: |
191 | | - - name: TPU_MIN_LOG_LEVEL |
192 | | - value: "0" |
193 | | - - name: TF_CPP_MIN_LOG_LEVEL |
194 | | - value: "0" |
195 | | - - name: XCLOUD_ENVIRONMENT |
196 | | - value: GCP |
197 | | - args: |
198 | | - - --alsologtostderr |
199 | | - - --pathways_server_port=38679 # changed to not match rm port |
200 | | - - --pathways_resource_manager=pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38677 |
201 | | - - --pathways_persistent_compilation_cache=false |
202 | | - - --pathways_compilation_mode=compile_at_worker |
203 | | - - --xla_tpu_enable_data_parallel_all_reduce_opt=true |
204 | | - - --xla_tpu_data_parallel_opt_different_sized_ops=true |
205 | | - - --xla_tpu_enable_async_collective_fusion=true |
206 | | - - --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true |
207 | | - - --xla_tpu_enable_async_collective_fusion_multiple_steps=true |
208 | | - - --xla_tpu_overlap_compute_collective_tc=true |
209 | | - - --xla_enable_async_all_gather=true |
210 | | - - --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp |
211 | | - ports: |
212 | | - - containerPort: 38679 |
213 | | - - containerPort: 38680 |
214 | | - - containerPort: 8471 |
215 | | - - containerPort: 8080 |
216 | | - resources: |
217 | | - limits: |
218 | | - google.com/tpu: 4 # Number of TPU chips per worker |
0 commit comments