From 8de6bb8fe8490bbd42ddc7239b3023a4a4abdd5f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 15 May 2026 05:31:48 +0000 Subject: [PATCH 1/3] Initial plan From 57922ba2af50fd158a16402c3c9597c2ea7a412c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 15 May 2026 05:42:15 +0000 Subject: [PATCH 2/3] refactor: unify multiprocess context registries Agent-Logs-Url: https://github.com/hlin99/LMCache/sessions/1778bac4-9474-4391-9ed4-611344f5567f Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com> --- lmcache/v1/multiprocess/blend_server_v2.py | 88 +++++----- .../v1/multiprocess/http_apis/cache_api.py | 8 + lmcache/v1/multiprocess/server.py | 163 +++++++++++------- tests/v1/multiprocess/test_http_server.py | 9 + 4 files changed, 163 insertions(+), 105 deletions(-) diff --git a/lmcache/v1/multiprocess/blend_server_v2.py b/lmcache/v1/multiprocess/blend_server_v2.py index 957562272e7..803a83930e7 100644 --- a/lmcache/v1/multiprocess/blend_server_v2.py +++ b/lmcache/v1/multiprocess/blend_server_v2.py @@ -38,6 +38,7 @@ """ # Standard +from dataclasses import dataclass from typing import Any import threading import time @@ -101,6 +102,15 @@ logger = init_logger(__name__) +@dataclass +class _CBRegisteredContext: + """Registered CB GPU context metadata.""" + + model_name: str + world_size: int + gpu_context: PlainGPUCacheContext + + class BlendTokenRangeMatcher: """Fast token-range matcher using polynomial rolling/chunk hashes and a direct-address lookup table. @@ -390,11 +400,9 @@ def __init__( storage_manager_config, chunk_size, hash_algorithm=hash_algorithm ) - self._cb_gpu_contexts: dict[int, PlainGPUCacheContext] = {} - - # CB GPU ID -> (model name, world size) as metadata + # CB instance ID -> registered context # NOTE: This is mainly for determining the layout desc during prefetch - self._cb_gpu_context_meta: dict[int, tuple[str, int]] = {} + self._cb_contexts: dict[int, _CBRegisteredContext] = {} # Fast local matcher: indexes pre-computed chunk hashes for sub-sequence lookup self._token_range_matcher = BlendTokenRangeMatcher(chunk_size) @@ -418,8 +426,11 @@ def cb_register_kv_cache( world_size: The world size associated with this KV cache. """ gpu_context = PlainGPUCacheContext(kv_caches, self.chunk_size) - self._cb_gpu_contexts[instance_id] = gpu_context - self._cb_gpu_context_meta[instance_id] = (model_name, world_size) + self._cb_contexts[instance_id] = _CBRegisteredContext( + model_name=model_name, + world_size=world_size, + gpu_context=gpu_context, + ) logger.info( "Registered CB KV cache for instance_id %d with %d layers", instance_id, @@ -451,9 +462,8 @@ def cb_unregister_kv_cache(self, instance_id: int) -> None: Args: instance_id: Unique identifier for the blend engine instance to unregister """ - if instance_id in self._cb_gpu_contexts: - del self._cb_gpu_contexts[instance_id] - del self._cb_gpu_context_meta[instance_id] + context = self._cb_contexts.pop(instance_id, None) + if context is not None: logger.info("Unregistered CB KV cache for instance_id %d", instance_id) else: logger.warning( @@ -471,30 +481,28 @@ def report_status(self) -> dict: status = super().report_status() cb_gpu_context_meta: dict[str, dict] = {} - for gpu_id, meta in self._cb_gpu_context_meta.items(): - model_name, world_size = meta + for instance_id, context in self._cb_contexts.items(): entry: dict = { - "model_name": model_name, - "world_size": world_size, + "model_name": context.model_name, + "world_size": context.world_size, } - ctx = self._cb_gpu_contexts.get(gpu_id) - if ctx is not None: - # bytes per token = 2 (K+V) * num_layers * hidden_dim_size * - # itemsize; num_tokens is the cache capacity, not a per-token - # cost. - cache_size_per_token = ( - 2 * ctx.num_layers * ctx.hidden_dim_size * ctx.dtype.itemsize - ) - entry["kv_cache_layout"] = { - "num_layers": ctx.num_layers, - "num_tokens": ctx.num_tokens, - "hidden_dim_size": ctx.hidden_dim_size, - "dtype": str(ctx.dtype), - "cache_size_per_token": cache_size_per_token, - } - cb_gpu_context_meta[str(gpu_id)] = entry - - status["registered_cb_gpu_ids"] = list(self._cb_gpu_contexts.keys()) + ctx = context.gpu_context + # bytes per token = 2 (K+V) * num_layers * hidden_dim_size * + # itemsize; num_tokens is the cache capacity, not a per-token + # cost. + cache_size_per_token = ( + 2 * ctx.num_layers * ctx.hidden_dim_size * ctx.dtype.itemsize + ) + entry["kv_cache_layout"] = { + "num_layers": ctx.num_layers, + "num_tokens": ctx.num_tokens, + "hidden_dim_size": ctx.hidden_dim_size, + "dtype": str(ctx.dtype), + "cache_size_per_token": cache_size_per_token, + } + cb_gpu_context_meta[str(instance_id)] = entry + + status["registered_cb_gpu_ids"] = list(self._cb_contexts.keys()) status["cb_gpu_context_meta"] = cb_gpu_context_meta return status @@ -593,9 +601,9 @@ def cb_lookup_pre_computed(self, key: IPCCacheEngineKey) -> list[CBMatchResult]: # Find the cb gpu context and calculate the layout desc layout_desc: MemoryLayoutDesc | None = None - for gpu_id, (m_name, w_size) in self._cb_gpu_context_meta.items(): - if m_name == model_name and w_size == world_size: - cb_ctx = self._cb_gpu_contexts[gpu_id] + for context in self._cb_contexts.values(): + if context.model_name == model_name and context.world_size == world_size: + cb_ctx = context.gpu_context layout_desc = MemoryLayoutDesc( shapes=[cb_ctx.get_kv_buffer_shape(self.chunk_size)], dtypes=[cb_ctx.dtype], @@ -845,10 +853,10 @@ def cb_store_pre_computed( """ num_tokens = key.end - key.start - assert instance_id in self._cb_gpu_contexts, ( + assert instance_id in self._cb_contexts, ( f"Instance ID {instance_id} not registered for CB KV cache" ) - gpu_context = self._cb_gpu_contexts[instance_id] + gpu_context = self._cb_contexts[instance_id].gpu_context # CPU-synchronous sentinel: GPU store is about to be enqueued. self._event_bus.publish( @@ -968,10 +976,10 @@ def cb_retrieve_pre_computed( Note: We must call `cb_lookup_pre_computed` first before calling this function """ - assert instance_id in self._cb_gpu_contexts, ( + assert instance_id in self._cb_contexts, ( f"Instance ID {instance_id} not registered for CB KV cache" ) - gpu_context = self._cb_gpu_contexts[instance_id] + gpu_context = self._cb_contexts[instance_id].gpu_context # One obj_key per match_result, in cur_st order cb_match_result = sorted(cb_match_result, key=lambda r: r.cur_st) @@ -1113,10 +1121,10 @@ def cb_store_final( num_tokens = key.end - key.start # Get GPU context - assert instance_id in self._cb_gpu_contexts, ( + assert instance_id in self._cb_contexts, ( f"Instance ID {instance_id} not registered for CB KV cache" ) - gpu_context = self._cb_gpu_contexts[instance_id] + gpu_context = self._cb_contexts[instance_id].gpu_context # CPU-synchronous sentinels: SUBMITTED before SESSION_END so the # tracing subscriber's in-flight counter is non-zero when SESSION_END diff --git a/lmcache/v1/multiprocess/http_apis/cache_api.py b/lmcache/v1/multiprocess/http_apis/cache_api.py index a1ea5e961cf..25bddb995f0 100644 --- a/lmcache/v1/multiprocess/http_apis/cache_api.py +++ b/lmcache/v1/multiprocess/http_apis/cache_api.py @@ -118,6 +118,14 @@ async def kvcache_check( ) gpu_ctxs = getattr(engine, "gpu_contexts", None) + if gpu_ctxs is None: + contexts = getattr(engine, "contexts", None) + if isinstance(contexts, dict): + gpu_ctxs = { + instance_id: context.gpu_context + for instance_id, context in contexts.items() + if context.gpu_context is not None + } if gpu_ctxs is None: return JSONResponse( status_code=501, diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py index 07cb129bba5..d65c1f3d47d 100644 --- a/lmcache/v1/multiprocess/server.py +++ b/lmcache/v1/multiprocess/server.py @@ -176,6 +176,29 @@ class _PrefetchJob: cache_salt: str = "" +@dataclass +class RegisteredContext: + """Registry entry for a registered worker cache context.""" + + model_name: str + world_size: int + gpu_context: GPUCacheContext | None = None + non_cuda_metadata: NonGpuContextMetadata | None = None + + @property + def is_gpu(self) -> bool: + """Whether this entry represents a GPU cache context.""" + return self.gpu_context is not None + + def get_layout_desc(self, chunk_size: int) -> MemoryLayoutDesc: + """Return memory layout descriptor for this context.""" + if self.gpu_context is not None: + return get_layout_desc(self.gpu_context, chunk_size) + if self.non_cuda_metadata is not None: + return self.non_cuda_metadata.layout_desc + raise ValueError("RegisteredContext has neither gpu_context nor non_cuda") + + # Main class for the mp cache engine class MPCacheEngine: def __init__( @@ -184,16 +207,11 @@ def __init__( chunk_size: int = 256, hash_algorithm: str = "blake3", ): - # GPU ID -> KV cache tensors - self.gpu_contexts: dict[int, GPUCacheContext] = {} - - # GPU ID -> (model name, world size) as metadata + # Instance ID -> registered context # NOTE: This is mainly for determining the layout desc during prefetch # We assume that if the (model name, world size) is the same, then - # the layout desc returned by the gpu context is the same. - self.gpu_context_meta: dict[int, tuple[str, int]] = {} - self.non_cuda_contexts: dict[int, NonGpuContextMetadata] = {} - self.non_cuda_context_meta: dict[int, tuple[str, int]] = {} + # the layout desc returned by the context is the same. + self.contexts: dict[int, RegisteredContext] = {} # chunk size self.chunk_size = chunk_size @@ -244,7 +262,7 @@ def register_kv_cache( layout_hints: See :class:`LayoutHints`. Forwarded to :class:`GPUCacheContext` for GPU KV format detection. """ - if instance_id in self.gpu_contexts: + if instance_id in self.contexts: logger.warning( "Instance %s's KV cache is already registered, " "skipping the new registration", @@ -258,8 +276,11 @@ def register_kv_cache( layout_hints=layout_hints or None, engine_type=engine_type, ) - self.gpu_contexts[instance_id] = gpu_context - self.gpu_context_meta[instance_id] = (model_name, world_size) + self.contexts[instance_id] = RegisteredContext( + model_name=model_name, + world_size=world_size, + gpu_context=gpu_context, + ) logger.info( "Registered KV cache for GPU ID %d with %d layers", instance_id, @@ -273,17 +294,16 @@ def unregister_kv_cache(self, instance_id: int) -> None: Args: instance_id (int): The GPU instance ID (such as PID). """ - if instance_id in self.gpu_contexts: - del self.gpu_contexts[instance_id] - del self.gpu_context_meta[instance_id] + context = self.contexts.pop(instance_id, None) + if context is None: + logger.warning("No KV cache found for GPU ID %d to unregister", instance_id) + return + + if context.is_gpu: logger.info("Unregistered KV cache for GPU ID %d", instance_id) torch_dev.empty_cache() - elif instance_id in self.non_cuda_contexts: - del self.non_cuda_contexts[instance_id] - del self.non_cuda_context_meta[instance_id] - logger.info("Unregistered non-CUDA context for instance ID %d", instance_id) else: - logger.warning("No KV cache found for GPU ID %d to unregister", instance_id) + logger.info("Unregistered non-CUDA context for instance ID %d", instance_id) def register_kv_cache_non_gpu_context( self, @@ -325,12 +345,15 @@ def register_kv_cache_non_gpu_context( else torch.Size([2, num_layers, self.chunk_size, hidden_dim_size]) ) layout_desc = MemoryLayoutDesc(shapes=[shape], dtypes=[dtype]) - self.non_cuda_contexts[instance_id] = NonGpuContextMetadata( - layout_desc=layout_desc, - block_size=block_size, - use_mla=use_mla, + self.contexts[instance_id] = RegisteredContext( + model_name=model_name, + world_size=world_size, + non_cuda_metadata=NonGpuContextMetadata( + layout_desc=layout_desc, + block_size=block_size, + use_mla=use_mla, + ), ) - self.non_cuda_context_meta[instance_id] = (model_name, world_size) def _resolve_obj_keys(self, key: IPCCacheEngineKey) -> list[ObjectKey]: """Resolve object keys from an IPC cache key. @@ -375,14 +398,15 @@ def store_cpu_chunks( """ obj_keys = self._resolve_obj_keys(key) - if instance_id not in self.non_cuda_contexts: + context = self.contexts.get(instance_id) + if context is None or context.non_cuda_metadata is None: raise ValueError( f"non-CUDA context not registered for instance ID {instance_id}" ) - ctx = self.non_cuda_contexts[instance_id] + non_cuda_metadata = context.non_cuda_metadata chunks: list[torch.Tensor] = pickle.loads(cpu_data) reserved_dict = self.storage_manager.reserve_write( - obj_keys, ctx.layout_desc, "new" + obj_keys, non_cuda_metadata.layout_desc, "new" ) written_keys: list[ObjectKey] = [] try: @@ -426,7 +450,8 @@ def retrieve_cpu_chunks( """ obj_keys = self._resolve_obj_keys(key) - if instance_id not in self.non_cuda_contexts: + context = self.contexts.get(instance_id) + if context is None or context.non_cuda_metadata is None: raise ValueError( f"non-CUDA context not registered for instance ID {instance_id}" ) @@ -473,11 +498,16 @@ def store( st = time.perf_counter() obj_keys = self._resolve_obj_keys(key) - assert instance_id in self.gpu_contexts, ( + assert instance_id in self.contexts, ( f"KV cache not registered for GPU ID {instance_id}" ) - gpu_context = self.gpu_contexts[instance_id] - model_name = self.gpu_context_meta[instance_id][0] + registered_context = self.contexts[instance_id] + gpu_context = registered_context.gpu_context + if gpu_context is None: + raise ValueError( + f"GPU KV cache not registered for instance ID {instance_id}" + ) + model_name = registered_context.model_name # ``blocks_per_chunk`` is counted in inference-engine-side # blocks (each block addresses @@ -656,11 +686,16 @@ def retrieve( st = time.perf_counter() obj_keys = self._resolve_obj_keys(key) - assert instance_id in self.gpu_contexts, ( + assert instance_id in self.contexts, ( f"KV cache not registered for GPU ID {instance_id}" ) - gpu_context = self.gpu_contexts[instance_id] - model_name = self.gpu_context_meta[instance_id][0] + registered_context = self.contexts[instance_id] + gpu_context = registered_context.gpu_context + if gpu_context is None: + raise ValueError( + f"GPU KV cache not registered for instance ID {instance_id}" + ) + model_name = registered_context.model_name # CPU-synchronous sentinel: a GPU retrieve is about to be enqueued. # Must be published via publish() (not publish_on_stream) so the @@ -834,18 +869,11 @@ def _find_layout_desc( Returns: The layout descriptor, or None if no context matches - ``(model_name, world_size)``. GPU contexts are checked first, - then CPU contexts. + ``(model_name, world_size)``. """ - for gpu_id, (m, w) in self.gpu_context_meta.items(): - if m == model_name and w == world_size: - return get_layout_desc( - self.gpu_contexts[gpu_id], - self.chunk_size, - ) - for instance_id, (m, w) in self.non_cuda_context_meta.items(): - if m == model_name and w == world_size: - return self.non_cuda_contexts[instance_id].layout_desc + for context in self.contexts.values(): + if context.model_name == model_name and context.world_size == world_size: + return context.get_layout_desc(self.chunk_size) return None def lookup( @@ -1161,13 +1189,19 @@ def report_status(self) -> dict: sm = self.storage_manager.report_status() gpu_context_meta: dict[str, dict] = {} - for gpu_id, meta in self.gpu_context_meta.items(): + non_cuda_context_meta: dict[str, dict] = {} + registered_gpu_ids: list[int] = [] + registered_non_cuda_instance_ids: list[int] = [] + for instance_id, context in self.contexts.items(): entry: dict = { - "model_name": meta[0], - "world_size": meta[1], + "model_name": context.model_name, + "world_size": context.world_size, } - ctx = self.gpu_contexts.get(gpu_id) - if ctx is not None: + if context.is_gpu: + registered_gpu_ids.append(instance_id) + ctx = context.gpu_context + if ctx is None: + continue entry["kv_cache_layout"] = { "num_layers": ctx.num_layers, "inference_engine_logical_block_size": ( @@ -1185,27 +1219,26 @@ def report_status(self) -> dict: "attention_backend": ctx.attention_backend, "cache_size_per_token": ctx.cache_size_per_token(), } - gpu_context_meta[str(gpu_id)] = entry + gpu_context_meta[str(instance_id)] = entry + continue + + registered_non_cuda_instance_ids.append(instance_id) + non_cuda_metadata = context.non_cuda_metadata + if non_cuda_metadata is None: + continue + entry["block_size"] = non_cuda_metadata.block_size + entry["use_mla"] = non_cuda_metadata.use_mla + non_cuda_context_meta[str(instance_id)] = entry return { "is_healthy": sm["is_healthy"], "engine_type": self.__class__.__name__, "chunk_size": self.chunk_size, "hash_algorithm": self.token_hasher.hash_algorithm_name, - "registered_gpu_ids": list(self.gpu_contexts.keys()), + "registered_gpu_ids": registered_gpu_ids, "gpu_context_meta": gpu_context_meta, - "registered_non_cuda_instance_ids": list(self.non_cuda_contexts.keys()), - "non_cuda_context_meta": { - str(instance_id): { - "model_name": model_name, - "world_size": world_size, - "block_size": self.non_cuda_contexts[instance_id].block_size, - "use_mla": self.non_cuda_contexts[instance_id].use_mla, - } - for instance_id, (model_name, world_size) in ( - self.non_cuda_context_meta.items() - ) - }, + "registered_non_cuda_instance_ids": registered_non_cuda_instance_ids, + "non_cuda_context_meta": non_cuda_context_meta, "active_sessions": self.session_manager.active_count(), "active_prefetch_jobs": self._active_prefetch_count(), "storage_manager": sm, @@ -1257,7 +1290,7 @@ def close(self) -> None: logger.info("MPCacheEngine closed") # Release GPU contexts - self.gpu_contexts.clear() + self.contexts.clear() def _active_prefetch_count(self) -> int: """Return the number of active prefetch jobs (thread-safe).""" diff --git a/tests/v1/multiprocess/test_http_server.py b/tests/v1/multiprocess/test_http_server.py index aa6d6659553..fd4de11be91 100644 --- a/tests/v1/multiprocess/test_http_server.py +++ b/tests/v1/multiprocess/test_http_server.py @@ -142,6 +142,15 @@ def test_no_gpu_contexts(self, client_with_engine, mock_engine): resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1") assert resp.status_code == 501 + def test_context_registry_fallback( + self, client_with_engine, mock_engine, mock_gpu_ctx + ): + """Engine contexts registry is accepted when gpu_contexts is absent.""" + mock_engine.gpu_contexts = None + mock_engine.contexts = {0: MagicMock(gpu_context=mock_gpu_ctx)} + resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1") + assert resp.status_code == 200 + def test_unknown_instance_id(self, client_with_engine): """404 when instance_id is not registered.""" resp = client_with_engine.get( From 59f44d34f3d75cda2e71616965d7cdf3eb9d9dc1 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 15 May 2026 05:47:16 +0000 Subject: [PATCH 3/3] chore: address review feedback on unified context registry Agent-Logs-Url: https://github.com/hlin99/LMCache/sessions/1778bac4-9474-4391-9ed4-611344f5567f Co-authored-by: hlin99 <73271530+hlin99@users.noreply.github.com> --- lmcache/v1/multiprocess/blend_server_v2.py | 3 +++ .../v1/multiprocess/http_apis/cache_api.py | 5 ++++- lmcache/v1/multiprocess/server.py | 20 ++++++++++++++----- tests/v1/multiprocess/test_http_server.py | 9 +++++++++ 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/lmcache/v1/multiprocess/blend_server_v2.py b/lmcache/v1/multiprocess/blend_server_v2.py index 803a83930e7..ba34e3fa5a9 100644 --- a/lmcache/v1/multiprocess/blend_server_v2.py +++ b/lmcache/v1/multiprocess/blend_server_v2.py @@ -106,8 +106,11 @@ class _CBRegisteredContext: """Registered CB GPU context metadata.""" + # Model identity for resolving CB layout desc during lookup. model_name: str + # World size used with model_name to match CB layout. world_size: int + # GPU context for CB store/retrieve operations. gpu_context: PlainGPUCacheContext diff --git a/lmcache/v1/multiprocess/http_apis/cache_api.py b/lmcache/v1/multiprocess/http_apis/cache_api.py index 25bddb995f0..729790dca89 100644 --- a/lmcache/v1/multiprocess/http_apis/cache_api.py +++ b/lmcache/v1/multiprocess/http_apis/cache_api.py @@ -120,11 +120,14 @@ async def kvcache_check( gpu_ctxs = getattr(engine, "gpu_contexts", None) if gpu_ctxs is None: contexts = getattr(engine, "contexts", None) + # Unified registry fallback: contexts is expected to be + # dict[int, RegisteredContext]-like, where each value may expose a + # nullable ``gpu_context`` attribute. if isinstance(contexts, dict): gpu_ctxs = { instance_id: context.gpu_context for instance_id, context in contexts.items() - if context.gpu_context is not None + if hasattr(context, "gpu_context") and context.gpu_context is not None } if gpu_ctxs is None: return JSONResponse( diff --git a/lmcache/v1/multiprocess/server.py b/lmcache/v1/multiprocess/server.py index d65c1f3d47d..aa3ff1dee64 100644 --- a/lmcache/v1/multiprocess/server.py +++ b/lmcache/v1/multiprocess/server.py @@ -180,9 +180,13 @@ class _PrefetchJob: class RegisteredContext: """Registry entry for a registered worker cache context.""" + # Model identity for resolving layout desc during lookup. model_name: str + # World size used with model_name to match cache layout. world_size: int + # GPU context for CUDA IPC registrations; None for non-CUDA registrations. gpu_context: GPUCacheContext | None = None + # Non-CUDA metadata for CPU context registrations; None for GPU registrations. non_cuda_metadata: NonGpuContextMetadata | None = None @property @@ -196,7 +200,11 @@ def get_layout_desc(self, chunk_size: int) -> MemoryLayoutDesc: return get_layout_desc(self.gpu_context, chunk_size) if self.non_cuda_metadata is not None: return self.non_cuda_metadata.layout_desc - raise ValueError("RegisteredContext has neither gpu_context nor non_cuda") + raise ValueError( + "RegisteredContext must have either gpu_context or " + "non_cuda_metadata, but both are None. Register via " + "register_kv_cache or register_kv_cache_non_gpu_context." + ) # Main class for the mp cache engine @@ -296,11 +304,13 @@ def unregister_kv_cache(self, instance_id: int) -> None: """ context = self.contexts.pop(instance_id, None) if context is None: - logger.warning("No KV cache found for GPU ID %d to unregister", instance_id) + logger.warning( + "No context found for instance ID %d to unregister", instance_id + ) return if context.is_gpu: - logger.info("Unregistered KV cache for GPU ID %d", instance_id) + logger.info("Unregistered KV cache for instance ID %d", instance_id) torch_dev.empty_cache() else: logger.info("Unregistered non-CUDA context for instance ID %d", instance_id) @@ -499,7 +509,7 @@ def store( obj_keys = self._resolve_obj_keys(key) assert instance_id in self.contexts, ( - f"KV cache not registered for GPU ID {instance_id}" + f"KV cache not registered for instance ID {instance_id}" ) registered_context = self.contexts[instance_id] gpu_context = registered_context.gpu_context @@ -687,7 +697,7 @@ def retrieve( obj_keys = self._resolve_obj_keys(key) assert instance_id in self.contexts, ( - f"KV cache not registered for GPU ID {instance_id}" + f"KV cache not registered for instance ID {instance_id}" ) registered_context = self.contexts[instance_id] gpu_context = registered_context.gpu_context diff --git a/tests/v1/multiprocess/test_http_server.py b/tests/v1/multiprocess/test_http_server.py index fd4de11be91..44aca13189c 100644 --- a/tests/v1/multiprocess/test_http_server.py +++ b/tests/v1/multiprocess/test_http_server.py @@ -151,6 +151,15 @@ def test_context_registry_fallback( resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1") assert resp.status_code == 200 + def test_context_registry_ignores_non_gpu_entries( + self, client_with_engine, mock_engine + ): + """Entries with ``gpu_context=None`` are filtered from fallback registry.""" + mock_engine.gpu_contexts = None + mock_engine.contexts = {0: MagicMock(gpu_context=None)} + resp = client_with_engine.get("/kvcache/check?block_ids=0&chunk_size=1") + assert resp.status_code == 404 + def test_unknown_instance_id(self, client_with_engine): """404 when instance_id is not registered.""" resp = client_with_engine.get(