diff --git a/api/v1alpha1/inferenceservice_types.go b/api/v1alpha1/inferenceservice_types.go index 3a483e2d..d73755e8 100644 --- a/api/v1alpha1/inferenceservice_types.go +++ b/api/v1alpha1/inferenceservice_types.go @@ -80,6 +80,27 @@ type SpeculativeDecodingSpec struct { NDraftMax *int32 `json:"nDraftMax,omitempty"` } +// ModelCacheSpec points this InferenceService's model cache at a user-managed +// PVC instead of the operator's shared/perService cache PVC. The operator +// mounts and populates the claim through the same prep + download init +// containers as the built-in cache, but never creates, mutates, or deletes it; +// the user owns the PVC end-to-end. +type ModelCacheSpec struct { + // ClaimName names a pre-existing PersistentVolumeClaim in the + // InferenceService's namespace to use as the writable model cache volume. + // Weights land under the usual <cacheKey>/ subdirectory of the claim, so + // RefreshPolicy and cache-key semantics are unchanged and multiple models + // can share one claim without colliding. The claim must already exist: + // when it is missing the InferenceService is marked Degraded rather than + // silently falling back to the shared cache. Ignored for pvc:// model + // sources (already staged, read-only, no download). Node alignment of + // RWO/local claims (via nodeSelector) is the user's responsibility. 
+ // +kubebuilder:validation:MinLength=1 + // +kubebuilder:validation:MaxLength=253 + // +optional + ClaimName string `json:"claimName,omitempty"` +} + type InferenceServiceSpec struct { // ModelRef references the Model CR that contains the model to serve // +kubebuilder:validation:Required @@ -403,6 +424,14 @@ type InferenceServiceSpec struct { // +optional SkipModelInit *bool `json:"skipModelInit,omitempty"` + // ModelCache overrides where this InferenceService caches model weights: + // when claimName is set, the named user-owned PVC is mounted as the + // writable model cache (prep + download init containers run against it) + // instead of the operator's shared/perService cache PVC. When unset, the + // operator-global cache mode applies unchanged. + // +optional + ModelCache *ModelCacheSpec `json:"modelCache,omitempty"` + // PersonaPlexConfig holds configuration for the PersonaPlex (Moshi) runtime. // Only used when Runtime is "personaplex". // +optional diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index ebd96fed..b3216776 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -624,6 +624,11 @@ func (in *InferenceServiceSpec) DeepCopyInto(out *InferenceServiceSpec) { *out = new(bool) **out = **in } + if in.ModelCache != nil { + in, out := &in.ModelCache, &out.ModelCache + *out = new(ModelCacheSpec) + **out = **in + } if in.PersonaPlexConfig != nil { in, out := &in.PersonaPlexConfig, &out.PersonaPlexConfig *out = new(PersonaPlexConfig) @@ -794,6 +799,21 @@ func (in *Model) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ModelCacheSpec) DeepCopyInto(out *ModelCacheSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ModelCacheSpec. 
+func (in *ModelCacheSpec) DeepCopy() *ModelCacheSpec { + if in == nil { + return nil + } + out := new(ModelCacheSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ModelList) DeepCopyInto(out *ModelList) { *out = *in diff --git a/charts/llmkube/templates/crds/inferenceservices.yaml b/charts/llmkube/templates/crds/inferenceservices.yaml index 13d64d59..178bc152 100644 --- a/charts/llmkube/templates/crds/inferenceservices.yaml +++ b/charts/llmkube/templates/crds/inferenceservices.yaml @@ -1490,6 +1490,29 @@ spec: - embedding - rerank type: string + modelCache: + description: |- + ModelCache overrides where this InferenceService caches model weights: + when claimName is set, the named user-owned PVC is mounted as the + writable model cache (prep + download init containers run against it) + instead of the operator's shared/perService cache PVC. When unset, the + operator-global cache mode applies unchanged. + properties: + claimName: + description: |- + ClaimName names a pre-existing PersistentVolumeClaim in the + InferenceService's namespace to use as the writable model cache volume. + Weights land under the usual <cacheKey>/ subdirectory of the claim, so + RefreshPolicy and cache-key semantics are unchanged and multiple models + can share one claim without colliding. The claim must already exist: + when it is missing the InferenceService is marked Degraded rather than + silently falling back to the shared cache. Ignored for pvc:// model + sources (already staged, read-only, no download). Node alignment of + RWO/local claims (via nodeSelector) is the user's responsibility. 
+ maxLength: 253 + minLength: 1 + type: string + type: object modelRef: description: ModelRef references the Model CR that contains the model to serve diff --git a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml index 71d62c29..03e01b34 100644 --- a/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml +++ b/config/crd/bases/inference.llmkube.dev_inferenceservices.yaml @@ -1486,6 +1486,29 @@ spec: - embedding - rerank type: string + modelCache: + description: |- + ModelCache overrides where this InferenceService caches model weights: + when claimName is set, the named user-owned PVC is mounted as the + writable model cache (prep + download init containers run against it) + instead of the operator's shared/perService cache PVC. When unset, the + operator-global cache mode applies unchanged. + properties: + claimName: + description: |- + ClaimName names a pre-existing PersistentVolumeClaim in the + InferenceService's namespace to use as the writable model cache volume. + Weights land under the usual <cacheKey>/ subdirectory of the claim, so + RefreshPolicy and cache-key semantics are unchanged and multiple models + can share one claim without colliding. The claim must already exist: + when it is missing the InferenceService is marked Degraded rather than + silently falling back to the shared cache. Ignored for pvc:// model + sources (already staged, read-only, no download). Node alignment of + RWO/local claims (via nodeSelector) is the user's responsibility. 
+ maxLength: 253 + minLength: 1 + type: string + type: object modelRef: description: ModelRef references the Model CR that contains the model to serve diff --git a/docs/MODEL-CACHE.md b/docs/MODEL-CACHE.md index cd857575..55b554a0 100644 --- a/docs/MODEL-CACHE.md +++ b/docs/MODEL-CACHE.md @@ -145,6 +145,46 @@ modelCache: accessMode: ReadWriteMany ``` +### Per-InferenceService Cache PVC (Bring Your Own) + +The cache backend above is an operator-global choice. To point a *single* +InferenceService at its own pre-existing, user-owned PVC — for example a +node-local volume for a large model pinned to one node, while everything else +rides the shared cache — set `spec.modelCache.claimName`: + +```yaml +apiVersion: inference.llmkube.dev/v1alpha1 +kind: InferenceService +metadata: + name: llama-3.1-70b +spec: + modelRef: llama-3.1-70b + modelCache: + claimName: my-model-cache # pre-existing PVC in the same namespace +``` + +Behavior: + +- The named PVC becomes the writable model cache for this workload only: the + same `model-cache-prep` and `model-downloader` init containers run against + it, weights land under the usual `<cacheKey>/` subdirectory, and the serving + container mounts it read-only. `RefreshPolicy` and cache-key semantics are + unchanged, so multiple models can safely share one claim. +- The operator **never creates or deletes** the claim — you own it end-to-end + (unlike `perService` mode, where the operator provisions and + garbage-collects `<isvc-name>-model-cache`). If the claim does not exist, the + InferenceService is marked `Degraded` with a `ModelCachePVCNotFound` event + instead of silently falling back to the shared cache. +- `claimName` targets the download path, so it is ignored for pre-staged + `pvc://` model sources (mounted read-only, no download); a warning event is + emitted if both are set. 
+- Node alignment is your responsibility: for an RWO or node-local claim, use + `nodeSelector` so the pod lands where the PVC binds (a + `WaitForFirstConsumer` local class binds on the first consumer; a pre-bound + RWO PVC pins the pod). +- `llmkube cache list` / `cache clear` inspect the shared cache only; they do + not see bring-your-own cache PVCs. + ## CLI Commands ### List Cached Models diff --git a/internal/controller/inferenceservice_controller.go b/internal/controller/inferenceservice_controller.go index 41c1754c..16f4344e 100644 --- a/internal/controller/inferenceservice_controller.go +++ b/internal/controller/inferenceservice_controller.go @@ -18,6 +18,7 @@ package controller import ( "context" + "fmt" "net/http" "strings" "time" @@ -167,10 +168,21 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req if effectiveModelCacheKey(model) != "" && r.ModelCachePath != "" { if err := r.ensureModelCachePVC(ctx, inferenceService); err != nil { log.Error(err, "Failed to ensure model cache PVC exists", "namespace", inferenceService.Namespace) - return r.updateStatusWithSchedulingInfo(ctx, inferenceService, PhaseFailed, modelReady, 0, desiredReplicas, "", "Failed to create model cache PVC", nil) + return r.updateStatusWithSchedulingInfo(ctx, inferenceService, PhaseFailed, modelReady, 0, desiredReplicas, "", + fmt.Sprintf("Failed to ensure model cache PVC: %v", err), nil) } } + // spec.modelCache.claimName targets the download path, so it is meaningless + // for a pre-staged pvc:// source (mounted read-only, no download). The + // claimName is ignored in that case; surface a Warning so the conflict is + // visible instead of silently dropped. 
+ if r.Recorder != nil && userModelCacheClaimName(inferenceService) != "" && isPVCSource(model.Spec.Source) { + r.Recorder.Eventf(inferenceService, nil, corev1.EventTypeWarning, "ModelCacheClaimIgnored", "Reconcile", + "spec.modelCache.claimName is ignored: model source %q is a pre-staged pvc:// volume (read-only, no download)", + model.Spec.Source) + } + isMetal := isMetalModel(model) if r.Recorder != nil && needsOffloadMemoryWarning(inferenceService) { diff --git a/internal/controller/inferenceservice_storage_test.go b/internal/controller/inferenceservice_storage_test.go index 16683235..be206771 100644 --- a/internal/controller/inferenceservice_storage_test.go +++ b/internal/controller/inferenceservice_storage_test.go @@ -23,6 +23,7 @@ import ( . "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/types" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -767,6 +768,166 @@ var _ = Describe("buildCachedStorageConfig cache mode selection (#728)", func() }) }) +var _ = Describe("buildCachedStorageConfig user claimName override (#928)", func() { + model := &inferencev1alpha1.Model{ + Spec: inferencev1alpha1.ModelSpec{Source: "https://example.com/model.gguf"}, + Status: inferencev1alpha1.ModelStatus{CacheKey: "abc123def456"}, + } + isvcWithClaim := func() *inferencev1alpha1.InferenceService { + return &inferencev1alpha1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{Name: "byo-isvc"}, + Spec: inferencev1alpha1.InferenceServiceSpec{ + ModelCache: &inferencev1alpha1.ModelCacheSpec{ClaimName: "my-model-cache"}, + }, + } + } + + It("mounts the user PVC instead of the shared PVC in shared mode", func() { + config := buildCachedStorageConfig(model, isvcWithClaim(), ModelCacheModeShared, "", "curl:8.18.0", 102) + Expect(config.volumes[0].PersistentVolumeClaim.ClaimName).To(Equal("my-model-cache")) + }) + + It("mounts the user PVC instead of the per-isvc PVC in perService 
mode", func() { + config := buildCachedStorageConfig(model, isvcWithClaim(), ModelCacheModePerService, "", "curl:8.18.0", 102) + Expect(config.volumes[0].PersistentVolumeClaim.ClaimName).To(Equal("my-model-cache")) + }) + + It("keeps the cache layout and init containers identical to the built-in cache path", func() { + config := buildCachedStorageConfig(model, isvcWithClaim(), "", "", "curl:8.18.0", 102) + + // Weights still land under <cacheKey>/, not the PVC root. + Expect(config.modelPath).To(Equal("/models/abc123def456/model.gguf")) + // Same prep + downloader init containers, mounted read-write. + Expect(config.initContainers).To(HaveLen(2)) + Expect(config.initContainers[0].Name).To(Equal("model-cache-prep")) + Expect(config.initContainers[1].Name).To(Equal("model-downloader")) + Expect(config.initContainers[1].VolumeMounts[0].ReadOnly).To(BeFalse()) + // The main container mounts the user PVC read-only. + Expect(config.volumeMounts[0].MountPath).To(Equal("/models")) + Expect(config.volumeMounts[0].ReadOnly).To(BeTrue()) + }) + + It("uses the user PVC for multi-file staged models too", func() { + staged := &inferencev1alpha1.Model{ + ObjectMeta: metav1.ObjectMeta{Name: "staged", Namespace: "default"}, + Spec: inferencev1alpha1.ModelSpec{ + Source: "hf://org/repo-GGUF", + Files: []string{"model-Q4_K_M.gguf"}, + }, + } + config := buildCachedStorageConfig(staged, isvcWithClaim(), "", "", "curl:8.18.0", 102) + Expect(config.volumes[0].PersistentVolumeClaim.ClaimName).To(Equal("my-model-cache")) + }) + + It("does not affect an InferenceService without modelCache (shared PVC as before)", func() { + isvc := &inferencev1alpha1.InferenceService{ObjectMeta: metav1.ObjectMeta{Name: "plain-isvc"}} + config := buildCachedStorageConfig(model, isvc, ModelCacheModeShared, "", "curl:8.18.0", 102) + Expect(config.volumes[0].PersistentVolumeClaim.ClaimName).To(Equal(ModelCachePVCName)) + }) +}) + +var _ = Describe("ensureModelCachePVC (user claimName, #928)", func() { + var reconciler 
*InferenceServiceReconciler + var isvc *inferencev1alpha1.InferenceService + const userClaim = "byo-model-cache" + + forceDeletePVC := func(name string) { + ctx := context.Background() + pvc := &corev1.PersistentVolumeClaim{} + key := types.NamespacedName{Name: name, Namespace: "default"} + if err := k8sClient.Get(ctx, key, pvc); err != nil { + return + } + if len(pvc.Finalizers) > 0 { + pvc.Finalizers = nil + _ = k8sClient.Update(ctx, pvc) + } + _ = k8sClient.Delete(ctx, pvc) + Eventually(func() bool { + return errors.IsNotFound(k8sClient.Get(ctx, key, &corev1.PersistentVolumeClaim{})) + }, "5s", "100ms").Should(BeTrue()) + } + + createUserPVC := func() { + pvc := &corev1.PersistentVolumeClaim{ + ObjectMeta: metav1.ObjectMeta{Name: userClaim, Namespace: "default"}, + Spec: corev1.PersistentVolumeClaimSpec{ + AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}, + Resources: corev1.VolumeResourceRequirements{ + Requests: corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("10Gi")}, + }, + }, + } + Expect(k8sClient.Create(context.Background(), pvc)).To(Succeed()) + } + + BeforeEach(func() { + forceDeletePVC(ModelCachePVCName) + forceDeletePVC(userClaim) + reconciler = &InferenceServiceReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + ModelCacheMode: ModelCacheModeShared, + } + isvc = &inferencev1alpha1.InferenceService{ + ObjectMeta: metav1.ObjectMeta{Name: "byo-cache-isvc", Namespace: "default"}, + Spec: inferencev1alpha1.InferenceServiceSpec{ + ModelRef: "some-model", + ModelCache: &inferencev1alpha1.ModelCacheSpec{ClaimName: userClaim}, + }, + } + }) + + AfterEach(func() { + forceDeletePVC(ModelCachePVCName) + forceDeletePVC(userClaim) + }) + + It("succeeds without creating any operator PVC when the user PVC exists", func() { + createUserPVC() + Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed()) + + // Neither the shared nor a per-isvc cache PVC is created. 
+ shared := &corev1.PersistentVolumeClaim{} + err := k8sClient.Get(context.Background(), types.NamespacedName{Name: ModelCachePVCName, Namespace: "default"}, shared) + Expect(errors.IsNotFound(err)).To(BeTrue()) + perISVC := &corev1.PersistentVolumeClaim{} + err = k8sClient.Get(context.Background(), types.NamespacedName{Name: isvc.Name + "-model-cache", Namespace: "default"}, perISVC) + Expect(errors.IsNotFound(err)).To(BeTrue()) + }) + + It("never adopts or mutates the user PVC (no owner refs, no operator labels)", func() { + createUserPVC() + Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed()) + + pvc := &corev1.PersistentVolumeClaim{} + Expect(k8sClient.Get(context.Background(), types.NamespacedName{Name: userClaim, Namespace: "default"}, pvc)).To(Succeed()) + Expect(pvc.OwnerReferences).To(BeEmpty()) + Expect(pvc.Labels).NotTo(HaveKey("app.kubernetes.io/managed-by")) + }) + + It("does not create the user PVC and errors clearly when it is missing", func() { + err := reconciler.ensureModelCachePVC(context.Background(), isvc) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring(userClaim)) + Expect(err.Error()).To(ContainSubstring("spec.modelCache.claimName")) + + pvc := &corev1.PersistentVolumeClaim{} + getErr := k8sClient.Get(context.Background(), types.NamespacedName{Name: userClaim, Namespace: "default"}, pvc) + Expect(errors.IsNotFound(getErr)).To(BeTrue()) + }) + + It("overrides perService mode as well (no -model-cache created)", func() { + reconciler.ModelCacheMode = ModelCacheModePerService + createUserPVC() + Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed()) + + perISVC := &corev1.PersistentVolumeClaim{} + err := k8sClient.Get(context.Background(), types.NamespacedName{Name: isvc.Name + "-model-cache", Namespace: "default"}, perISVC) + Expect(errors.IsNotFound(err)).To(BeTrue()) + }) +}) + var _ = Describe("resolveCacheMode", func() { It("maps an empty mode to the shared 
default", func() { Expect(resolveCacheMode("")).To(Equal(ModelCacheModeShared)) diff --git a/internal/controller/model_storage.go b/internal/controller/model_storage.go index 0f59e195..c887c4d4 100644 --- a/internal/controller/model_storage.go +++ b/internal/controller/model_storage.go @@ -70,12 +70,28 @@ func resolveCacheMode(mode string) string { return ModelCacheModeShared } +// userModelCacheClaimName returns the user-supplied cache PVC name from +// spec.modelCache.claimName, or "" when the InferenceService does not override +// the operator-global cache mode. +func userModelCacheClaimName(isvc *inferencev1alpha1.InferenceService) string { + if isvc == nil || isvc.Spec.ModelCache == nil { + return "" + } + return isvc.Spec.ModelCache.ClaimName +} + // modelCachePVCName returns the name of the model cache PVC for the given mode. -// In shared mode (the default, and the resolution of an empty mode) this is the -// single cluster-wide PVC; in perService mode it is the per-InferenceService PVC -// "<isvc-name>-model-cache". A nil isvc (unit tests that exercise the builder -// directly) falls back to the shared name. +// A per-InferenceService spec.modelCache.claimName override (#928) wins over +// the operator-global mode: that user-owned PVC becomes the cache volume for +// this workload only. Otherwise, in shared mode (the default, and the +// resolution of an empty mode) this is the single cluster-wide PVC; in +// perService mode it is the per-InferenceService PVC "<isvc-name>-model-cache". A +// nil isvc (unit tests that exercise the builder directly) falls back to the +// shared name. 
func modelCachePVCName(isvc *inferencev1alpha1.InferenceService, mode string) string { + if claim := userModelCacheClaimName(isvc); claim != "" { + return claim + } if resolveCacheMode(mode) == ModelCacheModeShared || isvc == nil { return ModelCachePVCName } @@ -644,6 +660,30 @@ func buildEmptyDirStorageConfig(model *inferencev1alpha1.Model, isvc *inferencev func (r *InferenceServiceReconciler) ensureModelCachePVC(ctx context.Context, isvc *inferencev1alpha1.InferenceService) error { log := logf.FromContext(ctx) + // Bring-your-own cache PVC (#928): spec.modelCache.claimName names a + // user-owned claim, so the operator never creates, mutates, or deletes + // it — it only verifies the claim exists. A missing claim is surfaced as + // an error (-> Degraded condition + event) rather than silently falling + // back to the shared cache. + if claim := userModelCacheClaimName(isvc); claim != "" { + pvc := &corev1.PersistentVolumeClaim{} + err := r.Get(ctx, types.NamespacedName{Name: claim, Namespace: isvc.Namespace}, pvc) + if err == nil { + return nil + } + if apierrors.IsNotFound(err) { + if r.Recorder != nil { + r.Recorder.Eventf(isvc, nil, corev1.EventTypeWarning, "ModelCachePVCNotFound", "Reconcile", + "spec.modelCache.claimName %q does not exist in namespace %q; create the PVC or remove the field", + claim, isvc.Namespace) + } + return fmt.Errorf( + "model cache PVC %q (spec.modelCache.claimName) not found in namespace %q: the claim is user-owned and must be created before use", + claim, isvc.Namespace) + } + return fmt.Errorf("failed to check user model cache PVC %q: %w", claim, err) + } + shared := resolveCacheMode(r.ModelCacheMode) == ModelCacheModeShared namespace := isvc.Namespace pvcName := modelCachePVCName(isvc, r.ModelCacheMode)