Skip to content

Commit 2b9eac4

Browse files
committed
Add spec maps for concrete validations, fix headless mode, cleanup.
1 parent 82394dc commit 2b9eac4

6 files changed

Lines changed: 154 additions & 475 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# pathways-job
22
The PathwaysJob API is an open-source, Kubernetes-native API for deploying ML training and batch inference workloads using Pathways on GKE.
3-
//ToDo(roshanin) - an intro of what Pathways is.
3+
//ToDo(roshanin) - add intro for Pathways.
44
## Description
55
The PathwaysJob is an API that provides an easy way to run JAX workloads using Pathways. It supports two modes of deployment.
66
### Colocate mode

api/v1/pathwaysjob_types.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2222
)
2323

24-
// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!
2524
// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.
2625

2726
// +kubebuilder:object:root=true

config/samples/jobset_example.yaml

Lines changed: 0 additions & 218 deletions
Original file line numberDiff line numberDiff line change
@@ -1,218 +0,0 @@
1-
# Copyright 2025 Google LLC
2-
#
3-
# Licensed under the Apache License, Version 2.0 (the "License");
4-
# you may not use this file except in compliance with the License.
5-
# You may obtain a copy of the License at
6-
#
7-
# http://www.apache.org/licenses/LICENSE-2.0
8-
#
9-
# Unless required by applicable law or agreed to in writing, software
10-
# distributed under the License is distributed on an "AS IS" BASIS,
11-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12-
# See the License for the specific language governing permissions and
13-
# limitations under the License.
14-
15-
apiVersion: jobset.x-k8s.io/v1alpha2
16-
kind: JobSet
17-
metadata:
18-
name: pathways-jobset-inference
19-
# annotations:
20-
# alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool # 1:1 job replica to node pool assignment
21-
spec:
22-
failurePolicy:
23-
maxRestarts: 4 # The set will be restarted on failures up to 4 times.
24-
replicatedJobs:
25-
- name: leader # Part of the name of the child Jobs (<replicateJobName>)
26-
replicas: 1 # Replicas of the Pathways Resource Manager, Proxy, JetStream and Tester. Should always be 1.
27-
template:
28-
spec: # JobSpec
29-
parallelism: 1 # Must be set to number of nodes in each node pool
30-
completions: 1 # Must be set to number of nodes in each node pool
31-
backoffLimit: 0 # Must be set to 0. Fail the job when any pod fails.
32-
template:
33-
spec:
34-
affinity:
35-
podAffinity:
36-
requiredDuringSchedulingIgnoredDuringExecution:
37-
- labelSelector:
38-
matchExpressions:
39-
- key: jobset.sigs.k8s.io/jobset-name
40-
operator: In
41-
values:
42-
- pathways-jobset-inference
43-
topologyKey: cloud.google.com/gke-nodepool
44-
podAntiAffinity: # ensures only this job lands on the rack
45-
requiredDuringSchedulingIgnoredDuringExecution:
46-
- labelSelector:
47-
matchExpressions:
48-
- key: jobset.sigs.k8s.io/jobset-name
49-
operator: NotIn
50-
values:
51-
- pathways-jobset-inference
52-
- key: job-name
53-
operator: Exists
54-
namespaceSelector: {}
55-
topologyKey: cloud.google.com/gke-nodepool
56-
nodeSelector:
57-
cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
58-
cloud.google.com/gke-tpu-topology: 2x2x2
59-
tolerations:
60-
- effect: NoSchedule
61-
key: google.com/tpu
62-
operator: Exists
63-
# nodeSelector:
64-
# cloud.google.com/gke-nodepool: cpu-user-np
65-
volumes:
66-
- name: shared-tmp
67-
hostPath:
68-
path: /tmp
69-
type: DirectoryOrCreate
70-
containers:
71-
- name: pathways-rm
72-
image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/server:latest
73-
imagePullPolicy: Always # Sometimes k8s was reusing the old image
74-
args:
75-
- --alsologtostderr
76-
- --pathways_server_port=38677
77-
- --pathways_server_provides_devices=false
78-
- --pathways_device_type=NONE
79-
- --pathways_persistent_compilation_cache=false
80-
- --pathways_compilation_mode=compile_at_worker
81-
- --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp
82-
- --pathways_expected_instances=tpuv4:2x2x2
83-
env:
84-
- name: TPU_SKIP_MDS_QUERY
85-
value: "true"
86-
- name: REPLICATED_JOB_NAME
87-
valueFrom:
88-
fieldRef:
89-
fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
90-
- name: JOBSET_NAME
91-
valueFrom:
92-
fieldRef:
93-
fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
94-
- name: HOST_ADDRESS
95-
value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
96-
ports:
97-
- containerPort: 38677
98-
- containerPort: 38678
99-
resources:
100-
limits:
101-
cpu: "4"
102-
memory: "8G"
103-
securityContext:
104-
privileged: true
105-
- name: pathways-proxy
106-
args:
107-
- --alsologtostderr
108-
- --v=0
109-
- --pathways_ifrt_proxy_server_resource_manager=pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38677
110-
# - --pathways_ifrt_proxy_server_resource_manager=localhost:38677
111-
- --pathways_ifrt_proxy_server_port=38681
112-
- --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp
113-
- --pathways_plaque_network=gcp
114-
image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/proxy_server:latest
115-
imagePullPolicy: Always
116-
ports:
117-
- containerPort: 38681
118-
- containerPort: 38682
119-
resources:
120-
limits:
121-
cpu: "4"
122-
memory: 10G
123-
securityContext:
124-
privileged: true
125-
- name: jetstream
126-
image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
127-
imagePullPolicy: Always
128-
ports:
129-
- containerPort: 9000
130-
env:
131-
- name: XCLOUD_ENVIRONMENT
132-
value: GCP
133-
- name: JAX_PLATFORMS
134-
value: proxy
135-
- name: JAX_BACKEND_TARGET
136-
value: grpc://pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38681
137-
# value: grpc://localhost:38681
138-
command:
139-
- bash
140-
- -c
141-
- 'echo Start: $(date); _sigterm() ( kill -SIGTERM $! 2>/dev/null;); trap
142-
_sigterm SIGTERM; (JAX_TRACEBACK_FILTERING=off python3 MaxText/maxengine_server.py
143-
MaxText/configs/inference_jetstream.yml tokenizer_path=assets/tokenizer.llama2
144-
load_parameters_path=gs://runner-maxtext-logs/2024-05-07-23-34/unscanned_chkpt/checkpoints/0/items
145-
max_prefill_predict_length=1024 max_target_length=2048 async_checkpointing=false
146-
model_name=''llama2-70b'' steps=1 ici_fsdp_parallelism=1 ici_autoregressive_parallelism=-1
147-
ici_tensor_parallelism=1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=2)
148-
& PID=$!; while kill -0 $PID 2>/dev/null; do sleep 5; done; wait $PID;
149-
EXIT_CODE=$? echo EXIT_CODE=$EXIT_CODE; echo End sleep: $(date); sleep
150-
infinity;'
151-
- name: tester
152-
image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
153-
imagePullPolicy: Always
154-
env: null
155-
command:
156-
- bash
157-
- -c
158-
- 'echo Start: $(date); _sigterm() ( kill -SIGTERM $! 2>/dev/null;); trap
159-
_sigterm SIGTERM; for i in {1..5}; do echo Sending request $i; time python3
160-
JetStream/jetstream/tools/requester.py --tokenizer assets/tokenizer.llama2
161-
--max_tokens=16 --server=0.0.0.0 --text="why earth is round"; EXIT_CODE=$?;
162-
echo Completed request; echo EXIT_CODE=$EXIT_CODE; if [[ $EXIT_CODE -ne
163-
0 ]]; then break; fi; done; echo Last EXIT_CODE=$EXIT_CODE; echo End sleep:
164-
$(date); sleep infinity;'
165-
securityContext:
166-
privileged: true
167-
- name: worker # Part of the name of the child Jobs (<replicateJobName>)
168-
replicas: 1 # Number of slices
169-
template:
170-
spec:
171-
parallelism: 2 # Must be set to number of nodes in each node pool
172-
completions: 2 # Must be set to number of nodes in each node pool
173-
backoffLimit: 0 # Must be set to 0. Fail the job when any pod fails.
174-
template:
175-
spec:
176-
nodeSelector:
177-
cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
178-
cloud.google.com/gke-tpu-topology: 2x2x2
179-
volumes:
180-
- name: shared-tmp
181-
hostPath:
182-
path: /tmp
183-
type: DirectoryOrCreate
184-
containers:
185-
- name: pathways-worker
186-
securityContext:
187-
privileged: true
188-
image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/server:latest
189-
imagePullPolicy: Always # Sometimes k8s was reusing the old image
190-
env:
191-
- name: TPU_MIN_LOG_LEVEL
192-
value: "0"
193-
- name: TF_CPP_MIN_LOG_LEVEL
194-
value: "0"
195-
- name: XCLOUD_ENVIRONMENT
196-
value: GCP
197-
args:
198-
- --alsologtostderr
199-
- --pathways_server_port=38679 # changed to not match rm port
200-
- --pathways_resource_manager=pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38677
201-
- --pathways_persistent_compilation_cache=false
202-
- --pathways_compilation_mode=compile_at_worker
203-
- --xla_tpu_enable_data_parallel_all_reduce_opt=true
204-
- --xla_tpu_data_parallel_opt_different_sized_ops=true
205-
- --xla_tpu_enable_async_collective_fusion=true
206-
- --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true
207-
- --xla_tpu_enable_async_collective_fusion_multiple_steps=true
208-
- --xla_tpu_overlap_compute_collective_tc=true
209-
- --xla_enable_async_all_gather=true
210-
- --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp
211-
ports:
212-
- containerPort: 38679
213-
- containerPort: 38680
214-
- containerPort: 8471
215-
- containerPort: 8080
216-
resources:
217-
limits:
218-
google.com/tpu: 4 # Number of TPU chips per worker

config/samples/pathways-job_v1_pathwaysjob.yaml

Lines changed: 80 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,19 @@
1515
apiVersion: pathways-job.pathways.domain/v1
1616
kind: PathwaysJob
1717
metadata:
18-
name: pathways-trial39
18+
name: pathways-trial62
1919
spec:
20-
maxRestarts: 4
20+
maxRestarts: 10
2121
workers:
2222
- type: tpu-v4-podslice
23-
topology: 2x2x1
24-
numSlices: 1
23+
topology: 2x2x2
24+
numSlices: 2
2525
pathwaysDir: "gs://cloud-pathways-staging/tmp"
2626
controller:
27-
deploymentMode: "colocate"
28-
# deploymentMode: "default"
29-
template:
27+
# #Pod template for training, default mode.
28+
29+
deploymentMode: default
30+
template: # UserPodTemplate
3031
spec:
3132
containers:
3233
- name: user
@@ -36,8 +37,7 @@ spec:
3637
- name: JAX_PLATFORMS
3738
value: proxy
3839
- name: JAX_BACKEND_TARGET
39-
value: grpc://pathways-trial39-leader-0-0.pathways-trial39:29008
40-
# value: grpc://pathways-trial38-proxy-0-0.pathways-trial38:29008
40+
value: grpc://pathways-trial62-proxy-0-0.pathways-trial62:29008
4141
image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
4242
imagePullPolicy: Always
4343
command:
@@ -48,7 +48,74 @@ spec:
4848
volumeMounts:
4949
- mountPath: /tmp
5050
name: shared-tmp
51-
# resources:
52-
# limits:
53-
# cpu: "20"
54-
# memory: 90G
51+
resources:
52+
limits:
53+
cpu: "20"
54+
memory: 90G
55+
56+
57+
58+
# #Pod template for inference, colocate mode.
59+
60+
61+
# deploymentMode: colocate
62+
# template: # UserPodTemplate
63+
# spec:
64+
# containers:
65+
# - name: jetstream
66+
# image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
67+
# imagePullPolicy: Always
68+
# ports:
69+
# - containerPort: 9000
70+
# env:
71+
# - name: XCLOUD_ENVIRONMENT
72+
# value: GCP
73+
# - name: JAX_PLATFORMS
74+
# value: proxy
75+
# - name: JAX_BACKEND_TARGET
76+
# value: grpc://pathways-trial62-leader-0-0.pathways-trial62:29008
77+
# command:
78+
# - bash
79+
# - -c
80+
# - 'echo Start: $(date);
81+
# _sigterm() ( kill -SIGTERM $! 2>/dev/null;);
82+
# trap _sigterm SIGTERM;
83+
# (JAX_TRACEBACK_FILTERING=off python3 MaxText/maxengine_server.py
84+
# MaxText/configs/inference_jetstream.yml tokenizer_path=assets/tokenizer.llama2
85+
# load_parameters_path=gs://runner-maxtext-logs/2024-05-07-23-34/unscanned_chkpt/checkpoints/0/items
86+
# max_prefill_predict_length=1024 max_target_length=2048 async_checkpointing=false
87+
# model_name=''llama2-70b'' steps=1 ici_fsdp_parallelism=1 ici_autoregressive_parallelism=-1
88+
# ici_tensor_parallelism=1 scan_layers=false weight_dtype=bfloat16
89+
# per_device_batch_size=2) & PID=$!;
90+
# while kill -0 $PID 2>/dev/null;
91+
# do sleep 5;
92+
# done;
93+
# wait $PID;
94+
# EXIT_CODE=$?;
95+
# echo EXIT_CODE=$EXIT_CODE;
96+
# echo End sleep: $(date);
97+
# sleep infinity;'
98+
# - name: tester
99+
# image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
100+
# imagePullPolicy: Always
101+
# command:
102+
# - bash
103+
# - -c
104+
# - 'echo Start: $(date);
105+
# _sigterm() ( kill -SIGTERM $! 2>/dev/null;);
106+
# trap _sigterm SIGTERM;
107+
# for i in {1..5}; do
108+
# echo Sending request $i;
109+
# time python3 JetStream/jetstream/tools/requester.py --tokenizer assets/tokenizer.llama2 --max_tokens=16 --server=0.0.0.0 --text="why earth is round";
110+
# EXIT_CODE=$?;
111+
# echo Completed request;
112+
# echo EXIT_CODE=$EXIT_CODE;
113+
# if [[ $EXIT_CODE -ne 0 ]]; then
114+
# break;
115+
# fi;
116+
# done;
117+
# echo Last EXIT_CODE=$EXIT_CODE;
118+
# echo End sleep: $(date);
119+
# sleep infinity;'
120+
# securityContext:
121+
# privileged: true

0 commit comments

Comments
 (0)