// WorkerTypeToTPUVersionMap maps a TPU worker type name to its TPU version
// identifier. Each worker type appears exactly once, and values carry no
// surrounding whitespace so they can be compared or concatenated verbatim.
// NOTE(review): how the version identifiers are consumed is not visible in
// this part of the file — confirm the expected spelling against the callers.
var WorkerTypeToTPUVersionMap = map[string]string{
	"tpu-v6e-slice":        "tpuv6e",
	"tpu-v5p-slice":        "tpuv5",
	"tpu-v5-lite-podslice": "tpuv5e",
	"tpu-v4-podslice":      "tpuv4",
}
6464
@@ -278,17 +278,15 @@ func validateTPUTopologyWithWorkerType(ctx context.Context, tpuGKEAcceleratorTyp
278278// Calculate the number of VMs based on the Topology (- used in completions/parallelisms)
279279func calculateVMsFromTopology (topology string ) int32 {
280280 parts := strings .Split (topology , "x" ) // Examples - 2x2x4 or 4x4
281- if len (parts ) < 2 {
282- return 0
283- }
284281 // Calculate the number of chips based on the Topology.
282+ // The topology must have already been validated with the worker type.
285283 chips := 1
286284 for _ , part := range parts {
287285 num , _ := strconv .Atoi (part )
288286 chips *= num
289287 }
290288 vms := 1
291- chipsperVM := 4
289+ chipsperVM := 4 // ToDo (roshanin): Add support for VMs with 8 chips per host.
292290 if chips >= chipsperVM {
293291 vms = chips / chipsperVM
294292 }
@@ -308,13 +306,24 @@ func calculateTPUInfo(ctx context.Context, pw *pathwaysjob.PathwaysJob) error {
308306 return nil
309307}
310308
309+ // Construct image tag based on Pathways version
310+ func makeImageTagUsingPathwaysVersion (pw * pathwaysjob.PathwaysJob ) string {
311+ var tag string
312+ if pw .Spec .PathwaysVersion != "" {
313+ tag = string (pw .Spec .PathwaysVersion )
314+ } else {
315+ tag = "latest"
316+ }
317+ return tag
318+ }
319+
311320// Constructs the Pathways resource manager container spec for the underlying JobSet
312321func MakeResourceManagerContainer (pw * pathwaysjob.PathwaysJob , rmJobName string ) (* corev1.Container , error ) {
313322 truth := true
314323
315324 rmContainerSpec := corev1.Container {
316325 Name : "pathways-rm" ,
317- Image : "us-docker.pkg.dev/cloud-tpu-v2-images-dev /pathways/sanitized_server:latest" ,
326+ Image : fmt . Sprintf ( "us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:%s" , makeImageTagUsingPathwaysVersion ( pw )) ,
318327 ImagePullPolicy : "Always" ,
319328 SecurityContext : & corev1.SecurityContext {Privileged : & truth },
320329 Args : []string {
@@ -342,7 +351,7 @@ func MakeProxyContainer(pw *pathwaysjob.PathwaysJob, rmJobName string) (*corev1.
342351
343352 proxyContainerSpec := corev1.Container {
344353 Name : "pathways-proxy" ,
345- Image : "us-docker.pkg.dev/cloud-tpu-v2-images-dev /pathways/sanitized_proxy_server:latest" ,
354+ Image : fmt . Sprintf ( "us-docker.pkg.dev/cloud-tpu-v2-images/pathways/proxy_server:%s" , makeImageTagUsingPathwaysVersion ( pw )) ,
346355 ImagePullPolicy : "Always" ,
347356 SecurityContext : & corev1.SecurityContext {Privileged : & truth },
348357 Args : []string {
@@ -360,6 +369,15 @@ func MakeProxyContainer(pw *pathwaysjob.PathwaysJob, rmJobName string) (*corev1.
360369func MakeWorkerJob (ctx context.Context , pw * pathwaysjob.PathwaysJob , rmJobName string ) (jobsetv1alpha2.ReplicatedJob , error ) {
361370 truth := true
362371 volumeSourceType := corev1 .HostPathDirectoryOrCreate
372+ objectMeta := metav1.ObjectMeta {}
373+
374+ if pw .Spec .Controller .DeploymentMode == pathwaysjob .Default {
375+ objectMeta = metav1.ObjectMeta {
376+ Annotations : map [string ]string {
377+ "alpha.jobset.sigs.k8s.io/exclusive-topology" : "cloud.google.com/gke-nodepool" ,
378+ },
379+ }
380+ }
363381
364382 workerJob := jobsetv1alpha2.ReplicatedJob {
365383 Name : "worker" ,
@@ -370,11 +388,12 @@ func MakeWorkerJob(ctx context.Context, pw *pathwaysjob.PathwaysJob, rmJobName s
370388 Completions : ptr .To (int32 (NumVMs )), // number of workers remember to change
371389 Parallelism : ptr .To (int32 (NumVMs )), // number of workers remember to change
372390 Template : corev1.PodTemplateSpec {
391+ ObjectMeta : objectMeta ,
373392 Spec : corev1.PodSpec {
374393 Containers : []corev1.Container {
375394 {
376395 Name : "pathways-worker" ,
377- Image : "us-docker.pkg.dev/cloud-tpu-v2-images-dev /pathways/sanitized_server:latest" ,
396+ Image : fmt . Sprintf ( "us-docker.pkg.dev/cloud-tpu-v2-images/pathways/server:%s" , makeImageTagUsingPathwaysVersion ( pw )) ,
378397 ImagePullPolicy : "Always" ,
379398 SecurityContext : & corev1.SecurityContext {Privileged : & truth },
380399 Args : []string {
@@ -549,6 +568,10 @@ func MakeJobsForDefaultDeployment(ctx context.Context, pw *pathwaysjob.PathwaysJ
549568 Parallelism : ptr .To (int32 (1 )),
550569 Template : corev1.PodTemplateSpec {
551570 Spec : corev1.PodSpec {
571+ NodeSelector : map [string ]string { // predictably place RM on CPUs
572+ "cloud.google.com/machine-family" : "n2" ,
573+ "node.kubernetes.io/instance-type" : "n2-standard-64" ,
574+ },
552575 HostNetwork : true , // For performance == McJAX
553576 DNSPolicy : corev1 .DNSClusterFirstWithHostNet , // For performance == McJAX
554577 Tolerations : []corev1.Toleration {
@@ -586,6 +609,10 @@ func MakeJobsForDefaultDeployment(ctx context.Context, pw *pathwaysjob.PathwaysJ
586609 Parallelism : ptr .To (int32 (1 )),
587610 Template : corev1.PodTemplateSpec {
588611 Spec : corev1.PodSpec {
612+ NodeSelector : map [string ]string { // predictably place RM on CPUs
613+ "cloud.google.com/machine-family" : "n2" ,
614+ "node.kubernetes.io/instance-type" : "n2-standard-64" ,
615+ },
589616 HostNetwork : true , // For performance == McJAX
590617 DNSPolicy : corev1 .DNSClusterFirstWithHostNet , // For performance == McJAX
591618 Tolerations : []corev1.Toleration {
@@ -629,6 +656,10 @@ func MakeJobsForDefaultDeployment(ctx context.Context, pw *pathwaysjob.PathwaysJ
629656 // },
630657 Template : corev1.PodTemplateSpec {
631658 Spec : corev1.PodSpec {
659+ NodeSelector : map [string ]string { // predictably place RM on CPUs
660+ "cloud.google.com/machine-family" : "n2" ,
661+ "node.kubernetes.io/instance-type" : "n2-standard-64" ,
662+ },
632663 HostNetwork : true , // For performance == McJAX
633664 DNSPolicy : corev1 .DNSClusterFirstWithHostNet , // For performance == McJAX
634665 Tolerations : []corev1.Toleration { // tolerations are important here to not run this job on TPUs
0 commit comments