From 8de6bb8fe8490bbd42ddc7239b3023a4a4abdd5f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 15 May 2026 05:31:48 +0000
Subject: [PATCH 1/3] Initial plan


From 57922ba2af50fd158a16402c3c9597c2ea7a412c Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 15 May 2026 05:42:15 +0000
Subject: [PATCH 2/3] refactor: unify multiprocess context registries

Agent-Logs-Url: https://github.com/hlin99/LMCache/sessions/1778bac4-9474-4391-9ed4-611344f5567f

Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com>
---
 lmcache/v1/multiprocess/blend_server_v2.py    |  88 +++++-----
 .../v1/multiprocess/http_apis/cache_api.py    |   8 +
 lmcache/v1/multiprocess/server.py             | 163 +++++++++++-------
 tests/v1/multiprocess/test_http_server.py     |   9 +
 4 files changed, 163 insertions(+), 105 deletions(-)

diff --git a/lmcache/v1/multiprocess/blend_server_v2.py b/lmcache/v1/multiprocess/blend_server_v2.py
index 957562272e7..803a83930e7 100644
--- a/lmcache/v1/multiprocess/blend_server_v2.py
+++ b/lmcache/v1/multiprocess/blend_server_v2.py
@@ -38,6 +38,7 @@
 """
 
 # Standard
+from dataclasses import dataclass
 from typing import Any
 import threading
 import time
@@ -101,6 +102,15 @@
 logger = init_logger(__name__)
 
 
+@dataclass
+class _CBRegisteredContext:
+    """Registered CB GPU context metadata."""
+
+    model_name: str
+    world_size: int
+    gpu_context: PlainGPUCacheContext
+
+
 class BlendTokenRangeMatcher:
     """Fast token-range matcher using polynomial rolling/chunk hashes and a
     direct-address lookup table.
@@ -390,11 +400,9 @@ def __init__(
             storage_manager_config, chunk_size, hash_algorithm=hash_algorithm
         )
 
-        self._cb_gpu_contexts: dict[int, PlainGPUCacheContext] = {}
-
-        # CB GPU ID -> (model name, world size) as metadata
+        # CB instance ID -> registered context
         # NOTE: This is mainly for determining the layout desc during prefetch
-        self._cb_gpu_context_meta: dict[int, tuple[str, int]] = {}
+        self._cb_contexts: dict[int, _CBRegisteredContext] = {}
 
         # Fast local matcher: indexes pre-computed chunk hashes for sub-sequence lookup
         self._token_range_matcher = BlendTokenRangeMatcher(chunk_size)
@@ -418,8 +426,11 @@ def cb_register_kv_cache(
             world_size: The world size associated with this KV cache.
         """
         gpu_context = PlainGPUCacheContext(kv_caches, self.chunk_size)
-        self._cb_gpu_contexts[instance_id] = gpu_context
-        self._cb_gpu_context_meta[instance_id] = (model_name, world_size)
+        self._cb_contexts[instance_id] = _CBRegisteredContext(
+            model_name=model_name,
+            world_size=world_size,
+            gpu_context=gpu_context,
+        )
         logger.info(
             "Registered CB KV cache for instance_id %d with %d layers",
             instance_id,
@@ -451,9 +462,8 @@ def cb_unregister_kv_cache(self, instance_id: int) -> None:
         Args:
             instance_id: Unique identifier for the blend engine instance to unregister
         """
-        if instance_id in self._cb_gpu_contexts:
-            del self._cb_gpu_contexts[instance_id]
-            del self._cb_gpu_context_meta[instance_id]
+        context = self._cb_contexts.pop(instance_id, None)
+        if context is not None:
             logger.info("Unregistered CB KV cache for instance_id %d", instance_id)
         else:
             logger.warning(
@@ -471,30 +481,28 @@ def report_status(self) -> dict:
         status = super().report_status()
 
         cb_gpu_context_meta: dict[str, dict] = {}
-        for gpu_id, meta in self._cb_gpu_context_meta.items():
-            model_name, world_size = meta
+        for instance_id, context in self._cb_contexts.items():
             entry: dict = {
-                "model_name": model_name,
-                "world_size": world_size,
+                "model_name": context.model_name,
+                "world_size": context.world_size,
             }
-            ctx = self._cb_gpu_contexts.get(gpu_id)
-            if ctx is not None:
-                # bytes per token = 2 (K+V) * num_layers * hidden_dim_size *
-                # itemsize; num_tokens is the cache capacity, not a per-token
-                # cost.
-                cache_size_per_token = (
-                    2 * ctx.num_layers * ctx.hidden_dim_size * ctx.dtype.itemsize
-                )
-                entry["kv_cache_layout"] = {
-                    "num_layers": ctx.num_layers,
-                    "num_tokens": ctx.num_tokens,
-                    "hidden_dim_size": ctx.hidden_dim_size,
-                    "dtype": str(ctx.dtype),
-                    "cache_size_per_token": cache_size_per_token,
-                }
-            cb_gpu_context_meta[str(gpu_id)] = entry
-
-        status["registered_cb_gpu_ids"] = list(self._cb_gpu_contexts.keys())
+            ctx = context.gpu_context
+            # bytes per token = 2 (K+V) * num_layers * hidden_dim_size *
+            # itemsize; num_tokens is the cache capacity, not a per-token
+            # cost.
+            cache_size_per_token = (
+                2 * ctx.num_layers * ctx.hidden_dim_size * ctx.dtype.itemsize
+            )
+            entry["kv_cache_layout"] = {
+                "num_layers": ctx.num_layers,
+                "num_tokens": ctx.num_tokens,
+                "hidden_dim_size": ctx.hidden_dim_size,
+                "dtype": str(ctx.dtype),
+                "cache_size_per_token": cache_size_per_token,
+            }
+            cb_gpu_context_meta[str(instance_id)] = entry
+
+        status["registered_cb_gpu_ids"] = list(self._cb_contexts.keys())
         status["cb_gpu_context_meta"] = cb_gpu_context_meta
         return status
 
@@ -593,9 +601,9 @@ def cb_lookup_pre_computed(self, key: IPCCacheEngineKey) -> list[CBMatchResult]:
 
         # Find the cb gpu context and calculate the layout desc
         layout_desc: MemoryLayoutDesc | None = None
-        for gpu_id, (m_name, w_size) in self._cb_gpu_context_meta.items():
-            if m_name == model_name and w_size == world_size:
-                cb_ctx = self._cb_gpu_contexts[gpu_id]
+        for context in self._cb_contexts.values():
+            if context.model_name == model_name and context.world_size == world_size:
+                cb_ctx = context.gpu_context
                 layout_desc = MemoryLayoutDesc(
                     shapes=[cb_ctx.get_kv_buffer_shape(self.chunk_size)],
                     dtypes=[cb_ctx.dtype],
@@ -845,10 +853,10 @@ def cb_store_pre_computed(
         """
         num_tokens = key.end - key.start
 
-        assert instance_id in self._cb_gpu_contexts, (
+        assert instance_id in self._cb_contexts, (
             f"Instance ID {instance_id} not registered for CB KV cache"
         )
-        gpu_context = self._cb_gpu_contexts[instance_id]
+        gpu_context = self._cb_contexts[instance_id].gpu_context
 
         # CPU-synchronous sentinel: GPU store is about to be enqueued.
         self._event_bus.publish(
@@ -968,10 +976,10 @@ def cb_retrieve_pre_computed(
         Note:
             We must call `cb_lookup_pre_computed` first before calling this function
         """
-        assert instance_id in self._cb_gpu_contexts, (
+        assert instance_id in self._cb_contexts, (
             f"Instance ID {instance_id} not registered for CB KV cache"
         )
-        gpu_context = self._cb_gpu_contexts[instance_id]
+        gpu_context = self._cb_contexts[instance_id].gpu_context
 
         # One obj_key per match_result, in cur_st order
         cb_match_result = sorted(cb_match_result, key=lambda r: r.cur_st)
@@ -1113,10 +1121,10 @@ def cb_store_final(
         num_tokens = key.end - key.start
 
         # Get GPU context
-        assert instance_id in self._cb_gpu_contexts, (
+        assert instance_id in self._cb_contexts, (
             f"Instance ID {instance_id} not registered for CB KV cache"
         )
-        gpu_context = self._cb_gpu_contexts[instance_id]
+        gpu_context = self._cb_contexts[instance_id].gpu_context
 
         # CPU-synchronous sentinels: SUBMITTED before SESSION_END so the
         # tracing subscriber's in-flight counter is non-zero when SESSION_END
diff --git a/lmcache/v1/multiprocess/http_apis/cache_api.py b/lmcache/v1/multiprocess/http_apis/cache_api.py
index a1ea5e961cf..25bddb995f0 100644
--- a/lmcache/v1/multiprocess/http_apis/cache_api.py
+++ b/lmcache/v1/multiprocess/http_apis/cache_api.py
@@ -118,6 +118,14 @@ async def kvcache_check(
         )
 
     gpu_ctxs = getattr(engine, "gpu_contexts", None)
+    if gpu_ctxs is None:
+        contexts = getattr(engine, "contexts", None)
+        if isinstance(contexts, dict):
+            gpu_ctxs = {
+                instance_id: context.gpu_context
+                for instance_id, context in contexts.items()
+                if context.gpu_context is not None
+            }
     if gpu_ctxs is None:
         return JSONResponse(
             status_code=501,
diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py
index 07cb129bba5..d65c1f3d47d 100644
--- a/lmcache/v1/multiprocess/server.py
+++ b/lmcache/v1/multiprocess/server.py
@@ -176,6 +176,29 @@ class _PrefetchJob:
     cache_salt: str = ""
 
 
+@dataclass
+class RegisteredContext:
+    """Registry entry for a registered worker cache context."""
+
+    model_name: str
+    world_size: int
+    gpu_context: GPUCacheContext | None = None
+    non_cuda_metadata: NonGpuContextMetadata | None = None
+
+    @property
+    def is_gpu(self) -> bool:
+        """Whether this entry represents a GPU cache context."""
+        return self.gpu_context is not None
+
+    def get_layout_desc(self, chunk_size: int) -> MemoryLayoutDesc:
+        """Return memory layout descriptor for this context."""
+        if self.gpu_context is not None:
+            return get_layout_desc(self.gpu_context, chunk_size)
+        if self.non_cuda_metadata is not None:
+            return self.non_cuda_metadata.layout_desc
+        raise ValueError("RegisteredContext has neither gpu_context nor non_cuda")
+
+
 # Main class for the mp cache engine
 class MPCacheEngine:
     def __init__(
@@ -184,16 +207,11 @@ def __init__(
         chunk_size: int = 256,
         hash_algorithm: str = "blake3",
     ):
-        # GPU ID -> KV cache tensors
-        self.gpu_contexts: dict[int, GPUCacheContext] = {}
-
-        # GPU ID -> (model name, world size) as metadata
+        # Instance ID -> registered context
         # NOTE: This is mainly for determining the layout desc during prefetch
         # We assume that if the (model name, world size) is the same, then
-        # the layout desc returned by the gpu context is the same.
-        self.gpu_context_meta: dict[int, tuple[str, int]] = {}
-        self.non_cuda_contexts: dict[int, NonGpuContextMetadata] = {}
-        self.non_cuda_context_meta: dict[int, tuple[str, int]] = {}
+        # the layout desc returned by the context is the same.
+        self.contexts: dict[int, RegisteredContext] = {}
 
         # chunk size
         self.chunk_size = chunk_size
@@ -244,7 +262,7 @@ def register_kv_cache(
             layout_hints: See :class:`LayoutHints`.  Forwarded to
                 :class:`GPUCacheContext` for GPU KV format detection.
         """
-        if instance_id in self.gpu_contexts:
+        if instance_id in self.contexts:
             logger.warning(
                 "Instance %s's KV cache is already registered, "
                 "skipping the new registration",
@@ -258,8 +276,11 @@ def register_kv_cache(
             layout_hints=layout_hints or None,
             engine_type=engine_type,
         )
-        self.gpu_contexts[instance_id] = gpu_context
-        self.gpu_context_meta[instance_id] = (model_name, world_size)
+        self.contexts[instance_id] = RegisteredContext(
+            model_name=model_name,
+            world_size=world_size,
+            gpu_context=gpu_context,
+        )
         logger.info(
             "Registered KV cache for GPU ID %d with %d layers",
             instance_id,
@@ -273,17 +294,16 @@ def unregister_kv_cache(self, instance_id: int) -> None:
         Args:
             instance_id (int): The GPU instance ID (such as PID).
         """
-        if instance_id in self.gpu_contexts:
-            del self.gpu_contexts[instance_id]
-            del self.gpu_context_meta[instance_id]
+        context = self.contexts.pop(instance_id, None)
+        if context is None:
+            logger.warning("No KV cache found for GPU ID %d to unregister", instance_id)
+            return
+
+        if context.is_gpu:
             logger.info("Unregistered KV cache for GPU ID %d", instance_id)
             torch_dev.empty_cache()
-        elif instance_id in self.non_cuda_contexts:
-            del self.non_cuda_contexts[instance_id]
-            del self.non_cuda_context_meta[instance_id]
-            logger.info("Unregistered non-CUDA context for instance ID %d", instance_id)
         else:
-            logger.warning("No KV cache found for GPU ID %d to unregister", instance_id)
+            logger.info("Unregistered non-CUDA context for instance ID %d", instance_id)
 
     def register_kv_cache_non_gpu_context(
         self,
@@ -325,12 +345,15 @@ def register_kv_cache_non_gpu_context(
             else torch.Size([2, num_layers, self.chunk_size, hidden_dim_size])
         )
         layout_desc = MemoryLayoutDesc(shapes=[shape], dtypes=[dtype])
-        self.non_cuda_contexts[instance_id] = NonGpuContextMetadata(
-            layout_desc=layout_desc,
-            block_size=block_size,
-            use_mla=use_mla,
+        self.contexts[instance_id] = RegisteredContext(
+            model_name=model_name,
+            world_size=world_size,
+            non_cuda_metadata=NonGpuContextMetadata(
+                layout_desc=layout_desc,
+                block_size=block_size,
+                use_mla=use_mla,
+            ),
         )
-        self.non_cuda_context_meta[instance_id] = (model_name, world_size)
 
     def _resolve_obj_keys(self, key: IPCCacheEngineKey) -> list[ObjectKey]:
         """Resolve object keys from an IPC cache key.
@@ -375,14 +398,15 @@ def store_cpu_chunks(
         """
         obj_keys = self._resolve_obj_keys(key)
 
-        if instance_id not in self.non_cuda_contexts:
+        context = self.contexts.get(instance_id)
+        if context is None or context.non_cuda_metadata is None:
             raise ValueError(
                 f"non-CUDA context not registered for instance ID {instance_id}"
             )
-        ctx = self.non_cuda_contexts[instance_id]
+        non_cuda_metadata = context.non_cuda_metadata
         chunks: list[torch.Tensor] = pickle.loads(cpu_data)
         reserved_dict = self.storage_manager.reserve_write(
-            obj_keys, ctx.layout_desc, "new"
+            obj_keys, non_cuda_metadata.layout_desc, "new"
         )
         written_keys: list[ObjectKey] = []
         try:
@@ -426,7 +450,8 @@ def retrieve_cpu_chunks(
         """
         obj_keys = self._resolve_obj_keys(key)
 
-        if instance_id not in self.non_cuda_contexts:
+        context = self.contexts.get(instance_id)
+        if context is None or context.non_cuda_metadata is None:
             raise ValueError(
                 f"non-CUDA context not registered for instance ID {instance_id}"
             )
@@ -473,11 +498,16 @@ def store(
         st = time.perf_counter()
         obj_keys = self._resolve_obj_keys(key)
 
-        assert instance_id in self.gpu_contexts, (
+        assert instance_id in self.contexts, (
             f"KV cache not registered for GPU ID {instance_id}"
         )
-        gpu_context = self.gpu_contexts[instance_id]
-        model_name = self.gpu_context_meta[instance_id][0]
+        registered_context = self.contexts[instance_id]
+        gpu_context = registered_context.gpu_context
+        if gpu_context is None:
+            raise ValueError(
+                f"GPU KV cache not registered for instance ID {instance_id}"
+            )
+        model_name = registered_context.model_name
 
         # ``blocks_per_chunk`` is counted in inference-engine-side
         # blocks (each block addresses
@@ -656,11 +686,16 @@ def retrieve(
         st = time.perf_counter()
         obj_keys = self._resolve_obj_keys(key)
 
-        assert instance_id in self.gpu_contexts, (
+        assert instance_id in self.contexts, (
             f"KV cache not registered for GPU ID {instance_id}"
         )
-        gpu_context = self.gpu_contexts[instance_id]
-        model_name = self.gpu_context_meta[instance_id][0]
+        registered_context = self.contexts[instance_id]
+        gpu_context = registered_context.gpu_context
+        if gpu_context is None:
+            raise ValueError(
+                f"GPU KV cache not registered for instance ID {instance_id}"
+            )
+        model_name = registered_context.model_name
 
         # CPU-synchronous sentinel: a GPU retrieve is about to be enqueued.
         # Must be published via publish() (not publish_on_stream) so the
@@ -834,18 +869,11 @@ def _find_layout_desc(
 
         Returns:
             The layout descriptor, or None if no context matches
-            ``(model_name, world_size)``. GPU contexts are checked first,
-            then CPU contexts.
+            ``(model_name, world_size)``.
         """
-        for gpu_id, (m, w) in self.gpu_context_meta.items():
-            if m == model_name and w == world_size:
-                return get_layout_desc(
-                    self.gpu_contexts[gpu_id],
-                    self.chunk_size,
-                )
-        for instance_id, (m, w) in self.non_cuda_context_meta.items():
-            if m == model_name and w == world_size:
-                return self.non_cuda_contexts[instance_id].layout_desc
+        for context in self.contexts.values():
+            if context.model_name == model_name and context.world_size == world_size:
+                return context.get_layout_desc(self.chunk_size)
         return None
 
     def lookup(
@@ -1161,13 +1189,19 @@ def report_status(self) -> dict:
         sm = self.storage_manager.report_status()
 
         gpu_context_meta: dict[str, dict] = {}
-        for gpu_id, meta in self.gpu_context_meta.items():
+        non_cuda_context_meta: dict[str, dict] = {}
+        registered_gpu_ids: list[int] = []
+        registered_non_cuda_instance_ids: list[int] = []
+        for instance_id, context in self.contexts.items():
             entry: dict = {
-                "model_name": meta[0],
-                "world_size": meta[1],
+                "model_name": context.model_name,
+                "world_size": context.world_size,
             }
-            ctx = self.gpu_contexts.get(gpu_id)
-            if ctx is not None:
+            if context.is_gpu:
+                registered_gpu_ids.append(instance_id)
+                ctx = context.gpu_context
+                if ctx is None:
+                    continue
                 entry["kv_cache_layout"] = {
                     "num_layers": ctx.num_layers,
                     "inference_engine_logical_block_size": (
@@ -1185,27 +1219,26 @@ def report_status(self) -> dict:
                     "attention_backend": ctx.attention_backend,
                     "cache_size_per_token": ctx.cache_size_per_token(),
                 }
-            gpu_context_meta[str(gpu_id)] = entry
+                gpu_context_meta[str(instance_id)] = entry
+                continue
+
+            registered_non_cuda_instance_ids.append(instance_id)
+            non_cuda_metadata = context.non_cuda_metadata
+            if non_cuda_metadata is None:
+                continue
+            entry["block_size"] = non_cuda_metadata.block_size
+            entry["use_mla"] = non_cuda_metadata.use_mla
+            non_cuda_context_meta[str(instance_id)] = entry
 
         return {
             "is_healthy": sm["is_healthy"],
             "engine_type": self.__class__.__name__,
             "chunk_size": self.chunk_size,
             "hash_algorithm": self.token_hasher.hash_algorithm_name,
-            "registered_gpu_ids": list(self.gpu_contexts.keys()),
+            "registered_gpu_ids": registered_gpu_ids,
             "gpu_context_meta": gpu_context_meta,
-            "registered_non_cuda_instance_ids": list(self.non_cuda_contexts.keys()),
-            "non_cuda_context_meta": {
-                str(instance_id): {
-                    "model_name": model_name,
-                    "world_size": world_size,
-                    "block_size": self.non_cuda_contexts[instance_id].block_size,
-                    "use_mla": self.non_cuda_contexts[instance_id].use_mla,
-                }
-                for instance_id, (model_name, world_size) in (
-                    self.non_cuda_context_meta.items()
-                )
-            },
+            "registered_non_cuda_instance_ids": registered_non_cuda_instance_ids,
+            "non_cuda_context_meta": non_cuda_context_meta,
             "active_sessions": self.session_manager.active_count(),
             "active_prefetch_jobs": self._active_prefetch_count(),
             "storage_manager": sm,
@@ -1257,7 +1290,7 @@ def close(self) -> None:
         logger.info("MPCacheEngine closed")
 
         # Release GPU contexts
-        self.gpu_contexts.clear()
+        self.contexts.clear()
 
     def _active_prefetch_count(self) -> int:
         """Return the number of active prefetch jobs (thread-safe)."""
diff --git a/tests/v1/multiprocess/test_http_server.py b/tests/v1/multiprocess/test_http_server.py
index aa6d6659553..fd4de11be91 100644
--- a/tests/v1/multiprocess/test_http_server.py
+++ b/tests/v1/multiprocess/test_http_server.py
@@ -142,6 +142,15 @@ def test_no_gpu_contexts(self, client_with_engine, mock_engine):
         resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1")
         assert resp.status_code == 501
 
+    def test_context_registry_fallback(
+        self, client_with_engine, mock_engine, mock_gpu_ctx
+    ):
+        """Engine contexts registry is accepted when gpu_contexts is absent."""
+        mock_engine.gpu_contexts = None
+        mock_engine.contexts = {0: MagicMock(gpu_context=mock_gpu_ctx)}
+        resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1")
+        assert resp.status_code == 200
+
     def test_unknown_instance_id(self, client_with_engine):
         """404 when instance_id is not registered."""
         resp = client_with_engine.get(

From 59f44d34f3d75cda2e71616965d7cdf3eb9d9dc1 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 15 May 2026 05:47:16 +0000
Subject: [PATCH 3/3] chore: address review feedback on unified context
 registry

Agent-Logs-Url: https://github.com/hlin99/LMCache/sessions/1778bac4-9474-4391-9ed4-611344f5567f

Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com>
---
 lmcache/v1/multiprocess/blend_server_v2.py    |  3 +++
 .../v1/multiprocess/http_apis/cache_api.py    |  5 ++++-
 lmcache/v1/multiprocess/server.py             | 20 ++++++++++++++-----
 tests/v1/multiprocess/test_http_server.py     |  9 +++++++++
 4 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/lmcache/v1/multiprocess/blend_server_v2.py b/lmcache/v1/multiprocess/blend_server_v2.py
index 803a83930e7..ba34e3fa5a9 100644
--- a/lmcache/v1/multiprocess/blend_server_v2.py
+++ b/lmcache/v1/multiprocess/blend_server_v2.py
@@ -106,8 +106,11 @@
 class _CBRegisteredContext:
     """Registered CB GPU context metadata."""
 
+    # Model identity for resolving CB layout desc during lookup.
     model_name: str
+    # World size used with model_name to match CB layout.
     world_size: int
+    # GPU context for CB store/retrieve operations.
     gpu_context: PlainGPUCacheContext
 
 
diff --git a/lmcache/v1/multiprocess/http_apis/cache_api.py b/lmcache/v1/multiprocess/http_apis/cache_api.py
index 25bddb995f0..729790dca89 100644
--- a/lmcache/v1/multiprocess/http_apis/cache_api.py
+++ b/lmcache/v1/multiprocess/http_apis/cache_api.py
@@ -120,11 +120,14 @@ async def kvcache_check(
     gpu_ctxs = getattr(engine, "gpu_contexts", None)
     if gpu_ctxs is None:
         contexts = getattr(engine, "contexts", None)
+        # Unified registry fallback: contexts is expected to be
+        # dict[int, RegisteredContext]-like, where each value may expose a
+        # nullable ``gpu_context`` attribute.
         if isinstance(contexts, dict):
             gpu_ctxs = {
                 instance_id: context.gpu_context
                 for instance_id, context in contexts.items()
-                if context.gpu_context is not None
+                if hasattr(context, "gpu_context") and context.gpu_context is not None
             }
     if gpu_ctxs is None:
         return JSONResponse(
diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py
index d65c1f3d47d..aa3ff1dee64 100644
--- a/lmcache/v1/multiprocess/server.py
+++ b/lmcache/v1/multiprocess/server.py
@@ -180,9 +180,13 @@ class _PrefetchJob:
 class RegisteredContext:
     """Registry entry for a registered worker cache context."""
 
+    # Model identity for resolving layout desc during lookup.
     model_name: str
+    # World size used with model_name to match cache layout.
     world_size: int
+    # GPU context for CUDA IPC registrations; None for non-CUDA registrations.
     gpu_context: GPUCacheContext | None = None
+    # Non-CUDA metadata for CPU context registrations; None for GPU registrations.
     non_cuda_metadata: NonGpuContextMetadata | None = None
 
     @property
@@ -196,7 +200,11 @@ def get_layout_desc(self, chunk_size: int) -> MemoryLayoutDesc:
             return get_layout_desc(self.gpu_context, chunk_size)
         if self.non_cuda_metadata is not None:
             return self.non_cuda_metadata.layout_desc
-        raise ValueError("RegisteredContext has neither gpu_context nor non_cuda")
+        raise ValueError(
+            "RegisteredContext must have either gpu_context or "
+            "non_cuda_metadata, but both are None. Register via "
+            "register_kv_cache or register_kv_cache_non_gpu_context."
+        )
 
 
 # Main class for the mp cache engine
@@ -296,11 +304,13 @@ def unregister_kv_cache(self, instance_id: int) -> None:
         """
         context = self.contexts.pop(instance_id, None)
         if context is None:
-            logger.warning("No KV cache found for GPU ID %d to unregister", instance_id)
+            logger.warning(
+                "No context found for instance ID %d to unregister", instance_id
+            )
             return
 
         if context.is_gpu:
-            logger.info("Unregistered KV cache for GPU ID %d", instance_id)
+            logger.info("Unregistered KV cache for instance ID %d", instance_id)
             torch_dev.empty_cache()
         else:
             logger.info("Unregistered non-CUDA context for instance ID %d", instance_id)
@@ -499,7 +509,7 @@ def store(
         obj_keys = self._resolve_obj_keys(key)
 
         assert instance_id in self.contexts, (
-            f"KV cache not registered for GPU ID {instance_id}"
+            f"KV cache not registered for instance ID {instance_id}"
         )
         registered_context = self.contexts[instance_id]
         gpu_context = registered_context.gpu_context
@@ -687,7 +697,7 @@ def retrieve(
         obj_keys = self._resolve_obj_keys(key)
 
         assert instance_id in self.contexts, (
-            f"KV cache not registered for GPU ID {instance_id}"
+            f"KV cache not registered for instance ID {instance_id}"
         )
         registered_context = self.contexts[instance_id]
         gpu_context = registered_context.gpu_context
diff --git a/tests/v1/multiprocess/test_http_server.py b/tests/v1/multiprocess/test_http_server.py
index fd4de11be91..44aca13189c 100644
--- a/tests/v1/multiprocess/test_http_server.py
+++ b/tests/v1/multiprocess/test_http_server.py
@@ -151,6 +151,15 @@ def test_context_registry_fallback(
         resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1")
         assert resp.status_code == 200
 
+    def test_context_registry_ignores_non_gpu_entries(
+        self, client_with_engine, mock_engine
+    ):
+        """Entries with ``gpu_context=None`` are filtered from fallback registry."""
+        mock_engine.gpu_contexts = None
+        mock_engine.contexts = {0: MagicMock(gpu_context=None)}
+        resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1")
+        assert resp.status_code == 404
+
     def test_unknown_instance_id(self, client_with_engine):
         """404 when instance_id is not registered."""
         resp = client_with_engine.get(