Add spec maps for concrete validations, fix headless mode, cleanup.

RoshaniN · RoshaniN · commit 2b9eac4963a0 · 2025-03-11T02:11:32.000Z
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # pathways-job
 The PathwaysJob API is an OSS Kubernetes-native API for deploying ML training and batch inference workloads using Pathways on GKE.
-//ToDo(roshanin) - an intro of what Pathways is.
+//ToDo(roshanin) - add intro for Pathways.
 ## Description
 The PathwaysJob is an API that provides an easy way to run JAX workloads using Pathways. It supports two modes of deployment.
 ### Colocate mode
diff --git a/api/v1/pathwaysjob_types.go b/api/v1/pathwaysjob_types.go
@@ -21,7 +21,6 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
-// EDIT THIS FILE!  THIS IS SCAFFOLDING FOR YOU TO OWN!
 // NOTE: json tags are required.  Any new fields you add must have json tags for the fields to be serialized.
 
 // +kubebuilder:object:root=true
diff --git a/config/samples/jobset_example.yaml b/config/samples/jobset_example.yaml
@@ -1,218 +0,0 @@
-# Copyright 2025 Google LLC
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-apiVersion: jobset.x-k8s.io/v1alpha2
-kind: JobSet
-metadata:
-  name: pathways-jobset-inference
-  # annotations:
-  #   alpha.jobset.sigs.k8s.io/exclusive-topology: cloud.google.com/gke-nodepool    # 1:1 job replica to node pool assignment
-spec:
-  failurePolicy:
-    maxRestarts: 4  # The set will be restarted on failures up to 4 times.
-  replicatedJobs:
-  - name: leader   # Part of the name of the child Jobs (<replicateJobName>)
-    replicas: 1    # Replicas of the Pathways Resource Manager, Proxy, JetStream and Tester. Should always be 1.
-    template:
-      spec: # JobSpec
-        parallelism: 1   # Must be set to number of nodes in each node pool
-        completions: 1   # Must be set to number of nodes in each node pool
-        backoffLimit: 0   # Must be set to 0. Fail the job when any pod fails.
-        template:
-          spec:
-            affinity:
-              podAffinity:
-                requiredDuringSchedulingIgnoredDuringExecution:
-                - labelSelector:
-                    matchExpressions:
-                    - key: jobset.sigs.k8s.io/jobset-name
-                      operator: In
-                      values:
-                      - pathways-jobset-inference
-                  topologyKey: cloud.google.com/gke-nodepool
-              podAntiAffinity: # ensures only this job lands on the rack
-                requiredDuringSchedulingIgnoredDuringExecution:
-                - labelSelector:
-                    matchExpressions:
-                    - key: jobset.sigs.k8s.io/jobset-name
-                      operator: NotIn
-                      values:
-                      - pathways-jobset-inference
-                    - key: job-name
-                      operator: Exists
-                  namespaceSelector: {}
-                  topologyKey: cloud.google.com/gke-nodepool
-            nodeSelector:
-              cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
-              cloud.google.com/gke-tpu-topology: 2x2x2
-            tolerations:
-            - effect: NoSchedule
-              key: google.com/tpu
-              operator: Exists
-            # nodeSelector:
-            #   cloud.google.com/gke-nodepool: cpu-user-np
-            volumes:
-            - name: shared-tmp
-              hostPath:
-                path: /tmp
-                type: DirectoryOrCreate
-            containers:
-            - name: pathways-rm
-              image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/server:latest
-              imagePullPolicy: Always  # Sometimes k8s was reusing the old image
-              args:
-              - --alsologtostderr
-              - --pathways_server_port=38677
-              - --pathways_server_provides_devices=false
-              - --pathways_device_type=NONE
-              - --pathways_persistent_compilation_cache=false
-              - --pathways_compilation_mode=compile_at_worker
-              - --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp
-              - --pathways_expected_instances=tpuv4:2x2x2
-              env:
-              - name: TPU_SKIP_MDS_QUERY
-                value: "true"
-              - name: REPLICATED_JOB_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/replicatedjob-name']
-              - name: JOBSET_NAME
-                valueFrom:
-                  fieldRef:
-                    fieldPath: metadata.annotations['jobset.sigs.k8s.io/jobset-name']
-              - name: HOST_ADDRESS
-                value: $(JOBSET_NAME)-$(REPLICATED_JOB_NAME)-0-0.$(JOBSET_NAME)
-              ports:
-              - containerPort: 38677
-              - containerPort: 38678
-              resources:
-                limits:
-                  cpu: "4"
-                  memory: "8G"
-              securityContext:
-                privileged: true
-            - name: pathways-proxy
-              args:
-              - --alsologtostderr
-              - --v=0
-              - --pathways_ifrt_proxy_server_resource_manager=pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38677
-              # - --pathways_ifrt_proxy_server_resource_manager=localhost:38677
-              - --pathways_ifrt_proxy_server_port=38681
-              - --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp
-              - --pathways_plaque_network=gcp
-              image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/proxy_server:latest
-              imagePullPolicy: Always
-              ports:
-              - containerPort: 38681
-              - containerPort: 38682
-              resources:
-                limits:
-                  cpu: "4"
-                  memory: 10G
-              securityContext:
-                privileged: true
-            - name: jetstream
-              image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
-              imagePullPolicy: Always
-              ports:
-              - containerPort: 9000
-              env:
-              - name: XCLOUD_ENVIRONMENT
-                value: GCP
-              - name: JAX_PLATFORMS
-                value: proxy
-              - name: JAX_BACKEND_TARGET
-                value: grpc://pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38681
-                # value: grpc://localhost:38681
-              command:
-                - bash
-                - -c
-                - 'echo Start: $(date); _sigterm() ( kill -SIGTERM $! 2>/dev/null;); trap
-                  _sigterm SIGTERM; (JAX_TRACEBACK_FILTERING=off python3 MaxText/maxengine_server.py
-                  MaxText/configs/inference_jetstream.yml tokenizer_path=assets/tokenizer.llama2
-                  load_parameters_path=gs://runner-maxtext-logs/2024-05-07-23-34/unscanned_chkpt/checkpoints/0/items
-                  max_prefill_predict_length=1024 max_target_length=2048 async_checkpointing=false
-                  model_name=''llama2-70b'' steps=1 ici_fsdp_parallelism=1 ici_autoregressive_parallelism=-1
-                  ici_tensor_parallelism=1 scan_layers=false weight_dtype=bfloat16 per_device_batch_size=2)
-                  & PID=$!; while kill -0 $PID 2>/dev/null; do sleep 5; done; wait $PID;
-                  EXIT_CODE=$? echo EXIT_CODE=$EXIT_CODE; echo End sleep: $(date); sleep
-                  infinity;'
-            - name: tester
-              image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
-              imagePullPolicy: Always
-              env: null
-              command:
-              - bash
-              - -c
-              - 'echo Start: $(date); _sigterm() ( kill -SIGTERM $! 2>/dev/null;); trap
-                _sigterm SIGTERM; for i in {1..5}; do echo Sending request $i; time python3
-                JetStream/jetstream/tools/requester.py --tokenizer assets/tokenizer.llama2
-                --max_tokens=16 --server=0.0.0.0 --text="why earth is round"; EXIT_CODE=$?;
-                echo Completed request; echo EXIT_CODE=$EXIT_CODE; if [[ $EXIT_CODE -ne
-                0 ]]; then break; fi; done; echo Last EXIT_CODE=$EXIT_CODE; echo End sleep:
-                $(date); sleep infinity;'
-              securityContext:
-                privileged: true
-  - name: worker  # Part of the name of the child Jobs (<replicateJobName>)
-    replicas: 1  # Number of slices
-    template:
-      spec:
-        parallelism: 2   # Must be set to number of nodes in each node pool
-        completions: 2   # Must be set to number of nodes in each node pool
-        backoffLimit: 0   # Must be set to 0. Fail the job when any pod fails.
-        template:
-          spec:
-            nodeSelector:
-              cloud.google.com/gke-tpu-accelerator: tpu-v4-podslice
-              cloud.google.com/gke-tpu-topology: 2x2x2
-            volumes:
-            - name: shared-tmp
-              hostPath:
-                path: /tmp
-                type: DirectoryOrCreate
-            containers:
-            - name: pathways-worker
-              securityContext:
-                privileged: true
-              image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/server:latest
-              imagePullPolicy: Always  # Sometimes k8s was reusing the old image
-              env:
-              - name: TPU_MIN_LOG_LEVEL
-                value: "0"
-              - name: TF_CPP_MIN_LOG_LEVEL
-                value: "0"
-              - name: XCLOUD_ENVIRONMENT
-                value: GCP
-              args:
-              - --alsologtostderr
-              - --pathways_server_port=38679 # changed to not match rm port
-              - --pathways_resource_manager=pathways-jobset-inference-leader-0-0.pathways-jobset-inference:38677
-              - --pathways_persistent_compilation_cache=false
-              - --pathways_compilation_mode=compile_at_worker
-              - --xla_tpu_enable_data_parallel_all_reduce_opt=true
-              - --xla_tpu_data_parallel_opt_different_sized_ops=true
-              - --xla_tpu_enable_async_collective_fusion=true
-              - --xla_tpu_enable_async_collective_fusion_fuse_all_gather=true
-              - --xla_tpu_enable_async_collective_fusion_multiple_steps=true
-              - --xla_tpu_overlap_compute_collective_tc=true
-              - --xla_enable_async_all_gather=true
-              - --pathways_tmp_dir_pattern=gs://cloud-pathways-staging/tmp
-              ports:
-              - containerPort: 38679
-              - containerPort: 38680
-              - containerPort: 8471
-              - containerPort: 8080
-              resources:
-                limits:
-                  google.com/tpu: 4  # Number of TPU chips per worker
diff --git a/config/samples/pathways-job_v1_pathwaysjob.yaml b/config/samples/pathways-job_v1_pathwaysjob.yaml
@@ -15,18 +15,19 @@
 apiVersion: pathways-job.pathways.domain/v1
 kind: PathwaysJob
 metadata:
-  name: pathways-trial39
+  name: pathways-trial62
 spec:
-  maxRestarts: 4
+  maxRestarts: 10
   workers:
   - type: tpu-v4-podslice
-    topology: 2x2x1
-    numSlices: 1
+    topology: 2x2x2
+    numSlices: 2
   pathwaysDir: "gs://cloud-pathways-staging/tmp"
   controller:
-    deploymentMode: "colocate"
-    # deploymentMode: "default"
-    template:
+    # #Pod template for training, default mode.
+
+    deploymentMode: default
+    template: # UserPodTemplate
       spec:
         containers:
         - name: user
@@ -36,8 +37,7 @@ spec:
           - name: JAX_PLATFORMS
             value: proxy
           - name: JAX_BACKEND_TARGET
-            value: grpc://pathways-trial39-leader-0-0.pathways-trial39:29008
-            # value: grpc://pathways-trial38-proxy-0-0.pathways-trial38:29008
+            value: grpc://pathways-trial62-proxy-0-0.pathways-trial62:29008
           image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
           imagePullPolicy: Always
           command:
@@ -48,7 +48,74 @@ spec:
           volumeMounts:
           - mountPath: /tmp
             name: shared-tmp
-          # resources:
-          #   limits:
-          #     cpu: "20"
-          #     memory: 90G
+          resources:
+            limits:
+              cpu: "20"
+              memory: 90G
+
+
+
+    # #Pod template for inference, colocate mode.
+
+
+    # deploymentMode: colocate
+    # template: # UserPodTemplate
+    #   spec:
+    #     containers:
+    #     - name: jetstream
+    #       image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
+    #       imagePullPolicy: Always
+    #       ports:
+    #       - containerPort: 9000
+    #       env:
+    #       - name: XCLOUD_ENVIRONMENT
+    #         value: GCP
+    #       - name: JAX_PLATFORMS
+    #         value: proxy
+    #       - name: JAX_BACKEND_TARGET
+    #         value: grpc://pathways-trial61-leader-0-0.pathways-trial61:29008
+    #       command:
+    #         - bash
+    #         - -c
+    #         - 'echo Start: $(date);
+    #           _sigterm() ( kill -SIGTERM $! 2>/dev/null;);
+    #           trap _sigterm SIGTERM;
+    #           (JAX_TRACEBACK_FILTERING=off python3 MaxText/maxengine_server.py
+    #           MaxText/configs/inference_jetstream.yml tokenizer_path=assets/tokenizer.llama2
+    #           load_parameters_path=gs://runner-maxtext-logs/2024-05-07-23-34/unscanned_chkpt/checkpoints/0/items
+    #           max_prefill_predict_length=1024 max_target_length=2048 async_checkpointing=false
+    #           model_name=''llama2-70b'' steps=1 ici_fsdp_parallelism=1 ici_autoregressive_parallelism=-1
+    #           ici_tensor_parallelism=1 scan_layers=false weight_dtype=bfloat16
+    #           per_device_batch_size=2) & PID=$!;
+    #           while kill -0 $PID 2>/dev/null;
+    #           do sleep 5;
+    #           done;
+    #           wait $PID;
+    #           EXIT_CODE=$?
+    #           echo EXIT_CODE=$EXIT_CODE;
+    #           echo End sleep: $(date);
+    #           sleep infinity;'
+    #     - name: tester
+    #       image: us-docker.pkg.dev/cloud-tpu-v2-images-dev/pathways/maxtext_jax_stable:latest
+    #       imagePullPolicy: Always
+    #       command:
+    #       - bash
+    #       - -c
+    #       - 'echo Start: $(date);
+    #         _sigterm() ( kill -SIGTERM $! 2>/dev/null;);
+    #         trap _sigterm SIGTERM;
+    #         for i in {1..5}; do
+    #           echo Sending request $i;
+    #           time python3 JetStream/jetstream/tools/requester.py --tokenizer assets/tokenizer.llama2 --max_tokens=16 --server=0.0.0.0 --text=\"why earth is round\";
+    #           EXIT_CODE=$?;
+    #           echo Completed request;
+    #           echo EXIT_CODE=$EXIT_CODE;
+    #           if [[ $EXIT_CODE -ne 0 ]]; then
+    #             break;
+    #           fi;
+    #         done;
+    #         echo Last EXIT_CODE=$EXIT_CODE;
+    #         echo End sleep: $(date);
+    #         sleep infinity;'
+    #       securityContext:
+    #         privileged: true
diff --git a/internal/controller/pathwaysjob_controller.go b/internal/controller/pathwaysjob_controller.go
diff --git a/pkg/utils/extra_prototype.go b/pkg/utils/extra_prototype.go

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,6 @@ import (`
`21`	`21`	`metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"`
`22`	`22`	`)`
`23`	`23`
`24`		`-// EDIT THIS FILE! THIS IS SCAFFOLDING FOR YOU TO OWN!`
`25`	`24`	`// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized.`
`26`	`25`
`27`	`26`	`// +kubebuilder:object:root=true`