defilantech · Defilan · Jul 3, 2026 · Jul 3, 2026
@@ -80,6 +80,27 @@ type SpeculativeDecodingSpec struct {
 	NDraftMax *int32 `json:"nDraftMax,omitempty"`
 }
 
+// ModelCacheSpec points this InferenceService's model cache at a user-managed
+// PVC instead of the operator's shared/perService cache PVC. The operator
+// mounts and populates the claim through the same prep + download init
+// containers as the built-in cache, but never creates, mutates, or deletes it;
+// the user owns the PVC end-to-end.
+type ModelCacheSpec struct {
+	// ClaimName names a pre-existing PersistentVolumeClaim in the
+	// InferenceService's namespace to use as the writable model cache volume.
+	// Weights land under the usual <cacheKey>/ subdirectory of the claim, so
+	// RefreshPolicy and cache-key semantics are unchanged and multiple models
+	// can share one claim without colliding. The claim must already exist:
+	// when it is missing the InferenceService is marked Degraded rather than
+	// silently falling back to the shared cache. Ignored for pvc:// model
+	// sources (already staged, read-only, no download). Node alignment of
+	// RWO/local claims (via nodeSelector) is the user's responsibility.
+	// +kubebuilder:validation:MinLength=1
+	// +kubebuilder:validation:MaxLength=253
+	// +optional
+	ClaimName string `json:"claimName,omitempty"` // the field doc above is reproduced verbatim as the CRD schema description for claimName; keep the two in sync
+}
+
 type InferenceServiceSpec struct {
 	// ModelRef references the Model CR that contains the model to serve
 	// +kubebuilder:validation:Required
@@ -403,6 +424,14 @@ type InferenceServiceSpec struct {
 	// +optional
 	SkipModelInit *bool `json:"skipModelInit,omitempty"`
 
+	// ModelCache overrides where this InferenceService caches model weights:
+	// when claimName is set, the named user-owned PVC is mounted as the
+	// writable model cache (prep + download init containers run against it)
+	// instead of the operator's shared/perService cache PVC. When unset, the
+	// operator-global cache mode applies unchanged.
+	// +optional
+	ModelCache *ModelCacheSpec `json:"modelCache,omitempty"`
+
 	// PersonaPlexConfig holds configuration for the PersonaPlex (Moshi) runtime.
 	// Only used when Runtime is "personaplex".
 	// +optional

@@ -1490,6 +1490,29 @@ spec:
                 - embedding
                 - rerank
                 type: string
+              modelCache:
+                description: |-
+                  ModelCache overrides where this InferenceService caches model weights:
+                  when claimName is set, the named user-owned PVC is mounted as the
+                  writable model cache (prep + download init containers run against it)
+                  instead of the operator's shared/perService cache PVC. When unset, the
+                  operator-global cache mode applies unchanged.
+                properties:
+                  claimName:
+                    description: |-
+                      ClaimName names a pre-existing PersistentVolumeClaim in the
+                      InferenceService's namespace to use as the writable model cache volume.
+                      Weights land under the usual <cacheKey>/ subdirectory of the claim, so
+                      RefreshPolicy and cache-key semantics are unchanged and multiple models
+                      can share one claim without colliding. The claim must already exist:
+                      when it is missing the InferenceService is marked Degraded rather than
+                      silently falling back to the shared cache. Ignored for pvc:// model
+                      sources (already staged, read-only, no download). Node alignment of
+                      RWO/local claims (via nodeSelector) is the user's responsibility.
+                    maxLength: 253
+                    minLength: 1
+                    type: string
+                type: object
               modelRef:
                 description: ModelRef references the Model CR that contains the model
                   to serve

@@ -1486,6 +1486,29 @@ spec:
                 - embedding
                 - rerank
                 type: string
+              modelCache:
+                description: |-
+                  ModelCache overrides where this InferenceService caches model weights:
+                  when claimName is set, the named user-owned PVC is mounted as the
+                  writable model cache (prep + download init containers run against it)
+                  instead of the operator's shared/perService cache PVC. When unset, the
+                  operator-global cache mode applies unchanged.
+                properties:
+                  claimName:
+                    description: |-
+                      ClaimName names a pre-existing PersistentVolumeClaim in the
+                      InferenceService's namespace to use as the writable model cache volume.
+                      Weights land under the usual <cacheKey>/ subdirectory of the claim, so
+                      RefreshPolicy and cache-key semantics are unchanged and multiple models
+                      can share one claim without colliding. The claim must already exist:
+                      when it is missing the InferenceService is marked Degraded rather than
+                      silently falling back to the shared cache. Ignored for pvc:// model
+                      sources (already staged, read-only, no download). Node alignment of
+                      RWO/local claims (via nodeSelector) is the user's responsibility.
+                    maxLength: 253
+                    minLength: 1
+                    type: string
+                type: object
               modelRef:
                 description: ModelRef references the Model CR that contains the model
                   to serve

@@ -145,6 +145,46 @@ modelCache:
   accessMode: ReadWriteMany
 ```
 
+### Per-InferenceService Cache PVC (Bring Your Own)
+
+The cache backend above is an operator-global choice. To point a *single*
+InferenceService at its own pre-existing, user-owned PVC — for example a
+node-local volume for a large model pinned to one node, while everything else
+rides the shared cache — set `spec.modelCache.claimName`:
+
+```yaml
+apiVersion: inference.llmkube.dev/v1alpha1
+kind: InferenceService
+metadata:
+  name: llama-3.1-70b
+spec:
+  modelRef: llama-3.1-70b
+  modelCache:
+    claimName: my-model-cache   # pre-existing PVC in the same namespace
+```
+
+Behavior:
+
+- The named PVC becomes the writable model cache for this workload only: the
+  same `model-cache-prep` and `model-downloader` init containers run against
+  it, weights land under the usual `<cacheKey>/` subdirectory, and the serving
+  container mounts it read-only. `RefreshPolicy` and cache-key semantics are
+  unchanged, so multiple models can safely share one claim.
+- The operator **never creates or deletes** the claim — you own it end-to-end
+  (unlike `perService` mode, where the operator provisions and
+  garbage-collects `<isvc>-model-cache`). If the claim does not exist, the
+  InferenceService is marked `Degraded` with a `ModelCachePVCNotFound` event
+  instead of silently falling back to the shared cache.
+- `claimName` targets the download path, so it is ignored for pre-staged
+  `pvc://` model sources (mounted read-only, no download); a
+  `ModelCacheClaimIgnored` warning event is emitted if both are set.
+- Node alignment is your responsibility: for an RWO or node-local claim, use
+  `nodeSelector` so the pod lands where the PVC binds (a
+  `WaitForFirstConsumer` local class binds on the first consumer; a pre-bound
+  RWO PVC pins the pod).
+- `llmkube cache list` and `llmkube cache clear` operate on the shared cache
+  only; they do not see bring-your-own cache PVCs.
+
 ## CLI Commands
 
 ### List Cached Models

@@ -18,6 +18,7 @@ package controller
 
 import (
 	"context"
+	"fmt"
 	"net/http"
 	"strings"
 	"time"
@@ -167,10 +168,21 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
 	if effectiveModelCacheKey(model) != "" && r.ModelCachePath != "" {
 		if err := r.ensureModelCachePVC(ctx, inferenceService); err != nil {
 			log.Error(err, "Failed to ensure model cache PVC exists", "namespace", inferenceService.Namespace)
-			return r.updateStatusWithSchedulingInfo(ctx, inferenceService, PhaseFailed, modelReady, 0, desiredReplicas, "", "Failed to create model cache PVC", nil)
+			return r.updateStatusWithSchedulingInfo(ctx, inferenceService, PhaseFailed, modelReady, 0, desiredReplicas, "",
+				fmt.Sprintf("Failed to ensure model cache PVC: %v", err), nil)
 		}
 	}
 
+	// spec.modelCache.claimName targets the download path, so it is meaningless
+	// for a pre-staged pvc:// source (mounted read-only, no download). The
+	// claimName is ignored in that case; surface a Warning so the conflict is
+	// visible instead of silently dropped.
+	if r.Recorder != nil && userModelCacheClaimName(inferenceService) != "" && isPVCSource(model.Spec.Source) {
+		r.Recorder.Eventf(inferenceService, nil, corev1.EventTypeWarning, "ModelCacheClaimIgnored", "Reconcile",
+			"spec.modelCache.claimName is ignored: model source %q is a pre-staged pvc:// volume (read-only, no download)",
+			model.Spec.Source)
+	}
+
 	isMetal := isMetalModel(model)
 
 	if r.Recorder != nil && needsOffloadMemoryWarning(inferenceService) {

@@ -23,6 +23,7 @@ import (
 	. "github.com/onsi/gomega"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/types"
 
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -767,6 +768,166 @@ var _ = Describe("buildCachedStorageConfig cache mode selection (#728)", func()
 	})
 })
 
+var _ = Describe("buildCachedStorageConfig user claimName override (#928)", func() {
+	// Single-file HTTP model with a resolved cache key, shared by every spec below.
+	cachedModel := &inferencev1alpha1.Model{
+		Spec:   inferencev1alpha1.ModelSpec{Source: "https://example.com/model.gguf"},
+		Status: inferencev1alpha1.ModelStatus{CacheKey: "abc123def456"},
+	}
+	// newBYOService returns a fresh InferenceService that names a user-owned claim.
+	newBYOService := func() *inferencev1alpha1.InferenceService {
+		svc := &inferencev1alpha1.InferenceService{ObjectMeta: metav1.ObjectMeta{Name: "byo-isvc"}}
+		svc.Spec.ModelCache = &inferencev1alpha1.ModelCacheSpec{ClaimName: "my-model-cache"}
+		return svc
+	}
+
+	It("mounts the user PVC instead of the shared PVC in shared mode", func() {
+		cfg := buildCachedStorageConfig(cachedModel, newBYOService(), ModelCacheModeShared, "", "curl:8.18.0", 102)
+		claim := cfg.volumes[0].PersistentVolumeClaim.ClaimName
+		Expect(claim).To(Equal("my-model-cache"))
+	})
+
+	It("mounts the user PVC instead of the per-isvc PVC in perService mode", func() {
+		cfg := buildCachedStorageConfig(cachedModel, newBYOService(), ModelCacheModePerService, "", "curl:8.18.0", 102)
+		claim := cfg.volumes[0].PersistentVolumeClaim.ClaimName
+		Expect(claim).To(Equal("my-model-cache"))
+	})
+
+	It("keeps the cache layout and init containers identical to the built-in cache path", func() {
+		cfg := buildCachedStorageConfig(cachedModel, newBYOService(), "", "", "curl:8.18.0", 102)
+
+		// The model path stays under the <cacheKey>/ subdirectory rather than the PVC root.
+		Expect(cfg.modelPath).To(Equal("/models/abc123def456/model.gguf"))
+		// Exactly two init containers: prep first, then the downloader with a writable mount.
+		Expect(cfg.initContainers).To(HaveLen(2))
+		prep, downloader := cfg.initContainers[0], cfg.initContainers[1]
+		Expect(prep.Name).To(Equal("model-cache-prep"))
+		Expect(downloader.Name).To(Equal("model-downloader"))
+		Expect(downloader.VolumeMounts[0].ReadOnly).To(BeFalse())
+		// The serving container sees the same volume at /models, read-only.
+		servingMount := cfg.volumeMounts[0]
+		Expect(servingMount.MountPath).To(Equal("/models"))
+		Expect(servingMount.ReadOnly).To(BeTrue())
+	})
+
+	It("uses the user PVC for multi-file staged models too", func() {
+		stagedModel := &inferencev1alpha1.Model{ObjectMeta: metav1.ObjectMeta{Name: "staged", Namespace: "default"}}
+		stagedModel.Spec.Source = "hf://org/repo-GGUF"
+		stagedModel.Spec.Files = []string{"model-Q4_K_M.gguf"}
+		cfg := buildCachedStorageConfig(stagedModel, newBYOService(), "", "", "curl:8.18.0", 102)
+		claim := cfg.volumes[0].PersistentVolumeClaim.ClaimName
+		Expect(claim).To(Equal("my-model-cache"))
+	})
+
+	It("does not affect an InferenceService without modelCache (shared PVC as before)", func() {
+		plain := &inferencev1alpha1.InferenceService{ObjectMeta: metav1.ObjectMeta{Name: "plain-isvc"}}
+		cfg := buildCachedStorageConfig(cachedModel, plain, ModelCacheModeShared, "", "curl:8.18.0", 102)
+		Expect(cfg.volumes[0].PersistentVolumeClaim.ClaimName).To(Equal(ModelCachePVCName))
+	})
+})
+
+var _ = Describe("ensureModelCachePVC (user claimName, #928)", func() {
+	const userClaim = "byo-model-cache"
+	var (
+		reconciler *InferenceServiceReconciler
+		isvc       *inferencev1alpha1.InferenceService
+	)
+
+	// fetchPVC reads the named claim from the "default" namespace; callers inspect the error.
+	fetchPVC := func(name string) (*corev1.PersistentVolumeClaim, error) {
+		found := &corev1.PersistentVolumeClaim{}
+		key := types.NamespacedName{Namespace: "default", Name: name}
+		return found, k8sClient.Get(context.Background(), key, found)
+	}
+
+	// forceDeletePVC removes the named claim if it exists, clearing finalizers
+	// first, then waits (up to 5s) until a Get reports NotFound.
+	forceDeletePVC := func(name string) {
+		existing, err := fetchPVC(name)
+		if err != nil {
+			return
+		}
+		ctx := context.Background()
+		if len(existing.Finalizers) != 0 {
+			existing.Finalizers = nil
+			_ = k8sClient.Update(ctx, existing) // best effort; the Eventually below is the real check
+		}
+		_ = k8sClient.Delete(ctx, existing)
+		Eventually(func() bool {
+			_, getErr := fetchPVC(name)
+			return errors.IsNotFound(getErr)
+		}, "5s", "100ms").Should(BeTrue())
+	}
+
+	// createUserPVC creates the pre-existing, user-owned claim (10Gi, ReadWriteOnce).
+	createUserPVC := func() {
+		claim := &corev1.PersistentVolumeClaim{ObjectMeta: metav1.ObjectMeta{Name: userClaim, Namespace: "default"}}
+		claim.Spec.AccessModes = []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}
+		claim.Spec.Resources.Requests = corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("10Gi")}
+		Expect(k8sClient.Create(context.Background(), claim)).To(Succeed())
+	}
+
+	BeforeEach(func() {
+		// Start every spec without the shared claim or the user claim.
+		forceDeletePVC(ModelCachePVCName)
+		forceDeletePVC(userClaim)
+
+		reconciler = &InferenceServiceReconciler{
+			Client:         k8sClient,
+			Scheme:         k8sClient.Scheme(),
+			ModelCacheMode: ModelCacheModeShared,
+		}
+		isvc = &inferencev1alpha1.InferenceService{ObjectMeta: metav1.ObjectMeta{Name: "byo-cache-isvc", Namespace: "default"}}
+		isvc.Spec.ModelRef = "some-model"
+		isvc.Spec.ModelCache = &inferencev1alpha1.ModelCacheSpec{ClaimName: userClaim}
+	})
+
+	AfterEach(func() {
+		forceDeletePVC(ModelCachePVCName)
+		forceDeletePVC(userClaim)
+	})
+
+	It("succeeds without creating any operator PVC when the user PVC exists", func() {
+		createUserPVC()
+		Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed())
+
+		// The operator must not have provisioned its shared or per-isvc claim.
+		_, sharedErr := fetchPVC(ModelCachePVCName)
+		Expect(errors.IsNotFound(sharedErr)).To(BeTrue())
+		_, perISVCErr := fetchPVC(isvc.Name + "-model-cache")
+		Expect(errors.IsNotFound(perISVCErr)).To(BeTrue())
+	})
+
+	It("never adopts or mutates the user PVC (no owner refs, no operator labels)", func() {
+		createUserPVC()
+		Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed())
+
+		claim, err := fetchPVC(userClaim)
+		Expect(err).To(Succeed())
+		Expect(claim.OwnerReferences).To(BeEmpty())
+		Expect(claim.Labels).NotTo(HaveKey("app.kubernetes.io/managed-by"))
+	})
+
+	It("does not create the user PVC and errors clearly when it is missing", func() {
+		ensureErr := reconciler.ensureModelCachePVC(context.Background(), isvc)
+		Expect(ensureErr).To(HaveOccurred())
+		Expect(ensureErr.Error()).To(ContainSubstring(userClaim))
+		Expect(ensureErr.Error()).To(ContainSubstring("spec.modelCache.claimName"))
+
+		_, getErr := fetchPVC(userClaim)
+		Expect(errors.IsNotFound(getErr)).To(BeTrue())
+	})
+
+	It("overrides perService mode as well (no <isvc>-model-cache created)", func() {
+		reconciler.ModelCacheMode = ModelCacheModePerService
+		createUserPVC()
+		Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed())
+
+		_, err := fetchPVC(isvc.Name + "-model-cache")
+		Expect(errors.IsNotFound(err)).To(BeTrue())
+	})
+})
+
 var _ = Describe("resolveCacheMode", func() {
 	It("maps an empty mode to the shared default", func() {
 		Expect(resolveCacheMode("")).To(Equal(ModelCacheModeShared))