Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions api/v1alpha1/inferenceservice_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,27 @@ type SpeculativeDecodingSpec struct {
NDraftMax *int32 `json:"nDraftMax,omitempty"`
}

// ModelCacheSpec points this InferenceService's model cache at a user-managed
// PVC instead of the operator's shared/perService cache PVC. The operator
// mounts and populates the claim through the same prep + download init
// containers as the built-in cache, but never creates, mutates, or deletes it;
// the user owns the PVC end-to-end.
//
// It is attached to a workload through InferenceServiceSpec.ModelCache; when
// that pointer is unset, the operator-global cache mode applies unchanged.
// The kubebuilder markers on ClaimName bound its length to 1-253 characters.
type ModelCacheSpec struct {
	// ClaimName names a pre-existing PersistentVolumeClaim in the
	// InferenceService's namespace to use as the writable model cache volume.
	// Weights land under the usual <cacheKey>/ subdirectory of the claim, so
	// RefreshPolicy and cache-key semantics are unchanged and multiple models
	// can share one claim without colliding. The claim must already exist:
	// when it is missing the InferenceService is marked Degraded rather than
	// silently falling back to the shared cache. Ignored for pvc:// model
	// sources (already staged, read-only, no download). Node alignment of
	// RWO/local claims (via nodeSelector) is the user's responsibility.
	// +kubebuilder:validation:MinLength=1
	// +kubebuilder:validation:MaxLength=253
	// +optional
	ClaimName string `json:"claimName,omitempty"`
}

type InferenceServiceSpec struct {
// ModelRef references the Model CR that contains the model to serve
// +kubebuilder:validation:Required
Expand Down Expand Up @@ -403,6 +424,14 @@ type InferenceServiceSpec struct {
// +optional
SkipModelInit *bool `json:"skipModelInit,omitempty"`

// ModelCache overrides where this InferenceService caches model weights:
// when claimName is set, the named user-owned PVC is mounted as the
// writable model cache (prep + download init containers run against it)
// instead of the operator's shared/perService cache PVC. When unset, the
// operator-global cache mode applies unchanged.
// +optional
ModelCache *ModelCacheSpec `json:"modelCache,omitempty"`

// PersonaPlexConfig holds configuration for the PersonaPlex (Moshi) runtime.
// Only used when Runtime is "personaplex".
// +optional
Expand Down
20 changes: 20 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

23 changes: 23 additions & 0 deletions charts/llmkube/templates/crds/inferenceservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1490,6 +1490,29 @@ spec:
- embedding
- rerank
type: string
modelCache:
description: |-
ModelCache overrides where this InferenceService caches model weights:
when claimName is set, the named user-owned PVC is mounted as the
writable model cache (prep + download init containers run against it)
instead of the operator's shared/perService cache PVC. When unset, the
operator-global cache mode applies unchanged.
properties:
claimName:
description: |-
ClaimName names a pre-existing PersistentVolumeClaim in the
InferenceService's namespace to use as the writable model cache volume.
Weights land under the usual <cacheKey>/ subdirectory of the claim, so
RefreshPolicy and cache-key semantics are unchanged and multiple models
can share one claim without colliding. The claim must already exist:
when it is missing the InferenceService is marked Degraded rather than
silently falling back to the shared cache. Ignored for pvc:// model
sources (already staged, read-only, no download). Node alignment of
RWO/local claims (via nodeSelector) is the user's responsibility.
maxLength: 253
minLength: 1
type: string
type: object
modelRef:
description: ModelRef references the Model CR that contains the model
to serve
Expand Down
23 changes: 23 additions & 0 deletions config/crd/bases/inference.llmkube.dev_inferenceservices.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,29 @@ spec:
- embedding
- rerank
type: string
modelCache:
description: |-
ModelCache overrides where this InferenceService caches model weights:
when claimName is set, the named user-owned PVC is mounted as the
writable model cache (prep + download init containers run against it)
instead of the operator's shared/perService cache PVC. When unset, the
operator-global cache mode applies unchanged.
properties:
claimName:
description: |-
ClaimName names a pre-existing PersistentVolumeClaim in the
InferenceService's namespace to use as the writable model cache volume.
Weights land under the usual <cacheKey>/ subdirectory of the claim, so
RefreshPolicy and cache-key semantics are unchanged and multiple models
can share one claim without colliding. The claim must already exist:
when it is missing the InferenceService is marked Degraded rather than
silently falling back to the shared cache. Ignored for pvc:// model
sources (already staged, read-only, no download). Node alignment of
RWO/local claims (via nodeSelector) is the user's responsibility.
maxLength: 253
minLength: 1
type: string
type: object
modelRef:
description: ModelRef references the Model CR that contains the model
to serve
Expand Down
40 changes: 40 additions & 0 deletions docs/MODEL-CACHE.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,46 @@ modelCache:
accessMode: ReadWriteMany
```

### Per-InferenceService Cache PVC (Bring Your Own)

The cache backend above is an operator-global choice. To point a *single*
InferenceService at its own pre-existing, user-owned PVC — for example a
node-local volume for a large model pinned to one node, while everything else
rides the shared cache — set `spec.modelCache.claimName`:

```yaml
apiVersion: inference.llmkube.dev/v1alpha1
kind: InferenceService
metadata:
name: llama-3.1-70b
spec:
modelRef: llama-3.1-70b
modelCache:
claimName: my-model-cache # pre-existing PVC in the same namespace
```

Behavior:

- The named PVC becomes the writable model cache for this workload only: the
same `model-cache-prep` and `model-downloader` init containers run against
it, weights land under the usual `<cacheKey>/` subdirectory, and the serving
container mounts it read-only. `RefreshPolicy` and cache-key semantics are
unchanged, so multiple models can safely share one claim.
- The operator **never creates, mutates, or deletes** the claim — you own it
end-to-end (unlike `perService` mode, where the operator provisions and
garbage-collects `<isvc>-model-cache`). If the claim does not exist, the
InferenceService is marked `Degraded` with a `ModelCachePVCNotFound` event
instead of silently falling back to the shared cache.
- `claimName` targets the download path, so it is ignored for pre-staged
`pvc://` model sources (mounted read-only, no download); a
`ModelCacheClaimIgnored` warning event is emitted if both are set.
- Node alignment is your responsibility: for an RWO or node-local claim, use
`nodeSelector` so the pod lands where the PVC binds (a
`WaitForFirstConsumer` local class binds on the first consumer; a pre-bound
RWO PVC pins the pod).
- `llmkube cache list` and `llmkube cache clear` operate on the shared cache
only; they do not see bring-your-own cache PVCs.

## CLI Commands

### List Cached Models
Expand Down
14 changes: 13 additions & 1 deletion internal/controller/inferenceservice_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package controller

import (
"context"
"fmt"
"net/http"
"strings"
"time"
Expand Down Expand Up @@ -167,10 +168,21 @@ func (r *InferenceServiceReconciler) Reconcile(ctx context.Context, req ctrl.Req
if effectiveModelCacheKey(model) != "" && r.ModelCachePath != "" {
if err := r.ensureModelCachePVC(ctx, inferenceService); err != nil {
log.Error(err, "Failed to ensure model cache PVC exists", "namespace", inferenceService.Namespace)
return r.updateStatusWithSchedulingInfo(ctx, inferenceService, PhaseFailed, modelReady, 0, desiredReplicas, "", "Failed to create model cache PVC", nil)
return r.updateStatusWithSchedulingInfo(ctx, inferenceService, PhaseFailed, modelReady, 0, desiredReplicas, "",
fmt.Sprintf("Failed to ensure model cache PVC: %v", err), nil)
}
}

// spec.modelCache.claimName targets the download path, so it is meaningless
// for a pre-staged pvc:// source (mounted read-only, no download). The
// claimName is ignored in that case; surface a Warning so the conflict is
// visible instead of silently dropped.
if r.Recorder != nil && userModelCacheClaimName(inferenceService) != "" && isPVCSource(model.Spec.Source) {
r.Recorder.Eventf(inferenceService, nil, corev1.EventTypeWarning, "ModelCacheClaimIgnored", "Reconcile",
"spec.modelCache.claimName is ignored: model source %q is a pre-staged pvc:// volume (read-only, no download)",
model.Spec.Source)
}

isMetal := isMetalModel(model)

if r.Recorder != nil && needsOffloadMemoryWarning(inferenceService) {
Expand Down
161 changes: 161 additions & 0 deletions internal/controller/inferenceservice_storage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -767,6 +768,166 @@ var _ = Describe("buildCachedStorageConfig cache mode selection (#728)", func()
})
})

// Specs for the spec.modelCache.claimName override in buildCachedStorageConfig
// (#928): the user-named PVC must back the cache volume in every cache mode,
// while the <cacheKey>/ layout and the init containers stay as in the built-in
// cache path.
var _ = Describe("buildCachedStorageConfig user claimName override (#928)", func() {
	const (
		userClaimName = "my-model-cache"
		initImage     = "curl:8.18.0"
	)

	// Single-file HTTP model with a cache key already present in status.
	cachedModel := &inferencev1alpha1.Model{
		Spec:   inferencev1alpha1.ModelSpec{Source: "https://example.com/model.gguf"},
		Status: inferencev1alpha1.ModelStatus{CacheKey: "abc123def456"},
	}

	// newBYOService builds a fresh InferenceService that names the user PVC,
	// so no spec can leak mutations into another.
	newBYOService := func() *inferencev1alpha1.InferenceService {
		svc := &inferencev1alpha1.InferenceService{}
		svc.Name = "byo-isvc"
		svc.Spec.ModelCache = &inferencev1alpha1.ModelCacheSpec{ClaimName: userClaimName}
		return svc
	}

	It("mounts the user PVC instead of the shared PVC in shared mode", func() {
		storage := buildCachedStorageConfig(cachedModel, newBYOService(), ModelCacheModeShared, "", initImage, 102)
		cacheVolume := storage.volumes[0]
		Expect(cacheVolume.PersistentVolumeClaim.ClaimName).To(Equal(userClaimName))
	})

	It("mounts the user PVC instead of the per-isvc PVC in perService mode", func() {
		storage := buildCachedStorageConfig(cachedModel, newBYOService(), ModelCacheModePerService, "", initImage, 102)
		cacheVolume := storage.volumes[0]
		Expect(cacheVolume.PersistentVolumeClaim.ClaimName).To(Equal(userClaimName))
	})

	It("keeps the cache layout and init containers identical to the built-in cache path", func() {
		storage := buildCachedStorageConfig(cachedModel, newBYOService(), "", "", initImage, 102)

		// The model file is addressed below <cacheKey>/ rather than at the PVC root.
		Expect(storage.modelPath).To(Equal("/models/abc123def456/model.gguf"))

		// Prep and downloader init containers are both present; the
		// downloader needs a writable mount.
		Expect(storage.initContainers).To(HaveLen(2))
		prep, downloader := storage.initContainers[0], storage.initContainers[1]
		Expect(prep.Name).To(Equal("model-cache-prep"))
		Expect(downloader.Name).To(Equal("model-downloader"))
		Expect(downloader.VolumeMounts[0].ReadOnly).To(BeFalse())

		// The serving container sees the same claim, but read-only.
		servingMount := storage.volumeMounts[0]
		Expect(servingMount.MountPath).To(Equal("/models"))
		Expect(servingMount.ReadOnly).To(BeTrue())
	})

	It("uses the user PVC for multi-file staged models too", func() {
		stagedModel := &inferencev1alpha1.Model{
			ObjectMeta: metav1.ObjectMeta{Name: "staged", Namespace: "default"},
			Spec: inferencev1alpha1.ModelSpec{
				Source: "hf://org/repo-GGUF",
				Files:  []string{"model-Q4_K_M.gguf"},
			},
		}
		storage := buildCachedStorageConfig(stagedModel, newBYOService(), "", "", initImage, 102)
		cacheVolume := storage.volumes[0]
		Expect(cacheVolume.PersistentVolumeClaim.ClaimName).To(Equal(userClaimName))
	})

	It("does not affect an InferenceService without modelCache (shared PVC as before)", func() {
		plainService := &inferencev1alpha1.InferenceService{ObjectMeta: metav1.ObjectMeta{Name: "plain-isvc"}}
		storage := buildCachedStorageConfig(cachedModel, plainService, ModelCacheModeShared, "", initImage, 102)
		cacheVolume := storage.volumes[0]
		Expect(cacheVolume.PersistentVolumeClaim.ClaimName).To(Equal(ModelCachePVCName))
	})
})

// Specs for ensureModelCachePVC when spec.modelCache.claimName is set (#928):
// the call succeeds only if the user's claim already exists, and it must not
// create an operator-managed PVC nor add owner references or managed-by labels
// to the user's claim.
var _ = Describe("ensureModelCachePVC (user claimName, #928)", func() {
	const (
		userClaim     = "byo-model-cache"
		testNamespace = "default"
	)

	var (
		reconciler *InferenceServiceReconciler
		isvc       *inferencev1alpha1.InferenceService
	)

	// pvcKey addresses a PVC by name in the namespace every spec here uses.
	pvcKey := func(name string) types.NamespacedName {
		return types.NamespacedName{Namespace: testNamespace, Name: name}
	}

	// pvcMissing reports whether looking up the named PVC yields NotFound.
	pvcMissing := func(name string) bool {
		lookupErr := k8sClient.Get(context.Background(), pvcKey(name), &corev1.PersistentVolumeClaim{})
		return errors.IsNotFound(lookupErr)
	}

	// purgePVC removes the named PVC if it is present and waits until the API
	// server no longer returns it. Cleanup is best-effort: Update/Delete
	// errors are deliberately ignored and the Eventually below is the check.
	purgePVC := func(name string) {
		ctx := context.Background()
		key := pvcKey(name)

		existing := &corev1.PersistentVolumeClaim{}
		if k8sClient.Get(ctx, key, existing) != nil {
			return
		}
		// Drop finalizers first so the Delete below is not held up by them.
		if len(existing.Finalizers) != 0 {
			existing.Finalizers = nil
			_ = k8sClient.Update(ctx, existing)
		}
		_ = k8sClient.Delete(ctx, existing)

		Eventually(func() bool {
			probe := &corev1.PersistentVolumeClaim{}
			return errors.IsNotFound(k8sClient.Get(ctx, key, probe))
		}, "5s", "100ms").Should(BeTrue())
	}

	// seedUserPVC creates the user-owned claim the InferenceService refers to.
	seedUserPVC := func() {
		claim := &corev1.PersistentVolumeClaim{
			ObjectMeta: metav1.ObjectMeta{Name: userClaim, Namespace: testNamespace},
			Spec: corev1.PersistentVolumeClaimSpec{
				AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
				Resources: corev1.VolumeResourceRequirements{
					Requests: corev1.ResourceList{corev1.ResourceStorage: resource.MustParse("10Gi")},
				},
			},
		}
		Expect(k8sClient.Create(context.Background(), claim)).To(Succeed())
	}

	BeforeEach(func() {
		purgePVC(ModelCachePVCName)
		purgePVC(userClaim)

		reconciler = &InferenceServiceReconciler{
			Client:         k8sClient,
			Scheme:         k8sClient.Scheme(),
			ModelCacheMode: ModelCacheModeShared,
		}
		isvc = &inferencev1alpha1.InferenceService{
			ObjectMeta: metav1.ObjectMeta{Name: "byo-cache-isvc", Namespace: testNamespace},
			Spec: inferencev1alpha1.InferenceServiceSpec{
				ModelRef:   "some-model",
				ModelCache: &inferencev1alpha1.ModelCacheSpec{ClaimName: userClaim},
			},
		}
	})

	AfterEach(func() {
		purgePVC(ModelCachePVCName)
		purgePVC(userClaim)
	})

	It("succeeds without creating any operator PVC when the user PVC exists", func() {
		seedUserPVC()
		Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed())

		// Neither the shared claim nor the <isvc>-model-cache claim may appear.
		Expect(pvcMissing(ModelCachePVCName)).To(BeTrue())
		Expect(pvcMissing(isvc.Name + "-model-cache")).To(BeTrue())
	})

	It("never adopts or mutates the user PVC (no owner refs, no operator labels)", func() {
		seedUserPVC()
		Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed())

		userPVC := &corev1.PersistentVolumeClaim{}
		Expect(k8sClient.Get(context.Background(), pvcKey(userClaim), userPVC)).To(Succeed())
		Expect(userPVC.OwnerReferences).To(BeEmpty())
		Expect(userPVC.Labels).NotTo(HaveKey("app.kubernetes.io/managed-by"))
	})

	It("does not create the user PVC and errors clearly when it is missing", func() {
		ensureErr := reconciler.ensureModelCachePVC(context.Background(), isvc)
		Expect(ensureErr).To(HaveOccurred())

		// The message must name both the claim and the spec field it came from.
		message := ensureErr.Error()
		Expect(message).To(ContainSubstring(userClaim))
		Expect(message).To(ContainSubstring("spec.modelCache.claimName"))

		Expect(pvcMissing(userClaim)).To(BeTrue())
	})

	It("overrides perService mode as well (no <isvc>-model-cache created)", func() {
		reconciler.ModelCacheMode = ModelCacheModePerService
		seedUserPVC()
		Expect(reconciler.ensureModelCachePVC(context.Background(), isvc)).To(Succeed())

		Expect(pvcMissing(isvc.Name + "-model-cache")).To(BeTrue())
	})
})

var _ = Describe("resolveCacheMode", func() {
It("maps an empty mode to the shared default", func() {
Expect(resolveCacheMode("")).To(Equal(ModelCacheModeShared))
Expand Down
Loading