hlin99 · hlin99 · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 22, 2026
diff --git a/.buildkite/k3_tests/multiprocess/pipeline.yml b/.buildkite/k3_tests/multiprocess/pipeline.yml
@@ -64,3 +64,10 @@ steps:
         agents: { queue: "k8s" }
         plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
         artifact_paths: ["*.log"]
+
+      - label: ":compression: cache_stats"
+        command: .buildkite/k3_tests/multiprocess/run.sh cache_stats
+        timeout_in_minutes: 30
+        agents: { queue: "k8s" }
+        plugins: [{ kubernetes: { podSpec: *pod-2gpu } }]
+        artifact_paths: ["*.log"]
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-cache-stats.sh b/.buildkite/k3_tests/multiprocess/scripts/run-cache-stats.sh
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+# Test that kv_transfer_params / cached_token_stats flows end-to-end
+# through the OpenAI-compatible API when LMCache MP mode is active.
+#
+# Flow:
+#   1. Send a long prompt (cold — populates LMCache, no cache hit)
+#   2. Send the same prompt again (warm — should hit LMCache)
+#   3. Verify the response contains cached_token_stats with expected values
+set -e
+set -o pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../../.." && pwd)"
+
+source "${REPO_ROOT}/.buildkite/k3_tests/common_scripts/helpers.sh"
+
+# Configuration (inherited from run-single-test.sh)
+VLLM_PORT="${VLLM_PORT:-8000}"
+MODEL="${MODEL:-Qwen/Qwen3-14B}"
+BUILD_ID="${BUILD_ID:-local_$$}"
+RESULTS_DIR="${RESULTS_DIR:-/tmp/lmcache_ci_results_${BUILD_ID}}"
+
+STATS_DIR="$RESULTS_DIR/cache_stats"
+mkdir -p "$STATS_DIR"
+
+echo "=== Cache Stats Reporting Test ==="
+echo "Model: $MODEL"
+echo "vLLM Port: $VLLM_PORT"
+echo "Results dir: $STATS_DIR"
+echo ""
+
+# Build a prompt long enough to span multiple LMCache chunks (default
+# chunk_size=256 tokens). Repeating a sentence gives us ~600+ tokens.
+LONG_CONTENT="Explain the history of computer science in great detail. $(printf 'The Turing machine is a fundamental concept in theoretical computer science that defines an abstract machine capable of manipulating symbols on a strip of tape according to a table of rules. %.0s' {1..20})"
+
+send_request() {
+    local label="$1"
+    local output_file="$2"
+
+    echo "--- Sending request: $label ---"
+    local http_code
+    http_code=$(curl -s -o "$output_file" -w "%{http_code}" \
+        -X POST "http://localhost:${VLLM_PORT}/v1/chat/completions" \
+        -H "Content-Type: application/json" \
+        -d "{
+            \"model\": \"${MODEL}\",
+            \"messages\": [{\"role\": \"user\", \"content\": $(python3 -c "import json; print(json.dumps('$LONG_CONTENT'))")}],
+            \"max_tokens\": 1,
+            \"kv_transfer_params\": {\"cached_token_stats\": true}
+        }")
+
+    if [ "$http_code" -ne 200 ]; then
+        echo "FAIL: $label returned HTTP $http_code"
+        cat "$output_file"
+        return 1
+    fi
+    echo "$label: HTTP 200 OK"
+}
+
+validate_stats_present() {
+    local label="$1"
+    local response_file="$2"
+
+    python3 -c "
+import json, sys
+
+with open('$response_file') as f:
+    data = json.load(f)
+
+kv_params = data.get('kv_transfer_params')
+if kv_params is None:
+    print('FAIL: $label — kv_transfer_params is missing from response')
+    sys.exit(1)
+
+stats = kv_params.get('cached_token_stats')
+if stats is None:
+    print('FAIL: $label — cached_token_stats is missing from kv_transfer_params')
+    print(f'  kv_transfer_params = {kv_params}')
+    sys.exit(1)
+
+required_keys = [
+    'num_vllm_cached_tokens',
+    'num_lmcache_cached_tokens',
+    'num_lmcache_extra_cached_tokens',
+]
+missing = [k for k in required_keys if k not in stats]
+if missing:
+    print(f'FAIL: $label — missing keys in cached_token_stats: {missing}')
+    print(f'  cached_token_stats = {stats}')
+    sys.exit(1)
+
+for k in required_keys:
+    v = stats[k]
+    if not isinstance(v, int) or v < 0:
+        print(f'FAIL: $label — {k} should be a non-negative integer, got {v!r}')
+        sys.exit(1)
+
+print(f'PASS: $label — cached_token_stats present with all required keys')
+print(f'  num_vllm_cached_tokens:          {stats[\"num_vllm_cached_tokens\"]}')
+print(f'  num_lmcache_cached_tokens:       {stats[\"num_lmcache_cached_tokens\"]}')
+print(f'  num_lmcache_extra_cached_tokens: {stats[\"num_lmcache_extra_cached_tokens\"]}')
+"
+}
+
+validate_warm_hit() {
+    local cold_file="$1"
+    local warm_file="$2"
+
+    python3 -c "
+import json, sys
+
+with open('$cold_file') as f:
+    cold = json.load(f)
+with open('$warm_file') as f:
+    warm = json.load(f)
+
+cold_stats = cold['kv_transfer_params']['cached_token_stats']
+warm_stats = warm['kv_transfer_params']['cached_token_stats']
+
+cold_lmcache = cold_stats['num_lmcache_cached_tokens']
+warm_lmcache = warm_stats['num_lmcache_cached_tokens']
+
+print(f'Cold request — num_lmcache_cached_tokens: {cold_lmcache}')
+print(f'Warm request — num_lmcache_cached_tokens: {warm_lmcache}')
+
+if warm_lmcache <= cold_lmcache:
+    print(f'FAIL: warm request should have more LMCache hits than cold request')
+    print(f'  cold={cold_lmcache}, warm={warm_lmcache}')
+    sys.exit(1)
+
+if warm_lmcache == 0:
+    print('FAIL: warm request has 0 LMCache cached tokens (cache not populated?)')
+    sys.exit(1)
+
+print(f'PASS: warm request has more LMCache hits ({warm_lmcache} > {cold_lmcache})')
+"
+}
+
+# ── Step 1: Cold request (populates LMCache) ──────────────────
+echo "============================================"
+echo "=== Step 1: Cold request ==="
+echo "============================================"
+if ! send_request "Cold" "$STATS_DIR/cold_response.json"; then
+    exit 1
+fi
+if ! validate_stats_present "Cold" "$STATS_DIR/cold_response.json"; then
+    exit 1
+fi
+echo ""
+
+# Small delay to let the store operation complete in LMCache
+sleep 2
+
+# ── Step 2: Warm request (same prompt, should hit cache) ──────
+echo "============================================"
+echo "=== Step 2: Warm request ==="
+echo "============================================"
+if ! send_request "Warm" "$STATS_DIR/warm_response.json"; then
+    exit 1
+fi
+if ! validate_stats_present "Warm" "$STATS_DIR/warm_response.json"; then
+    exit 1
+fi
+echo ""
+
+# ── Step 3: Validate cache hit improvement ────────────────────
+echo "============================================"
+echo "=== Step 3: Validate cache hit ==="
+echo "============================================"
+if ! validate_warm_hit "$STATS_DIR/cold_response.json" "$STATS_DIR/warm_response.json"; then
+    exit 1
+fi
+echo ""
+
+# ── Step 4: Verify opt-in behavior ────────────────────────────
+# Request WITHOUT kv_transfer_params should NOT have stats in response.
+echo "============================================"
+echo "=== Step 4: Verify opt-in (no stats without opt-in) ==="
+echo "============================================"
+
+echo "--- Sending request without kv_transfer_params ---"
+http_code=$(curl -s -o "$STATS_DIR/no_opt_in_response.json" -w "%{http_code}" \
+    -X POST "http://localhost:${VLLM_PORT}/v1/chat/completions" \
+    -H "Content-Type: application/json" \
+    -d "{
+        \"model\": \"${MODEL}\",
+        \"messages\": [{\"role\": \"user\", \"content\": \"Hello, how are you?\"}],
+        \"max_tokens\": 1
+    }")
+
+if [ "$http_code" -ne 200 ]; then
+    echo "FAIL: no-opt-in request returned HTTP $http_code"
+    exit 1
+fi
+
+python3 -c "
+import json, sys
+
+with open('$STATS_DIR/no_opt_in_response.json') as f:
+    data = json.load(f)
+
+kv_params = data.get('kv_transfer_params')
+if kv_params is not None:
+    print(f'FAIL: kv_transfer_params should be absent without opt-in, got {kv_params}')
+    sys.exit(1)
+
+print('PASS: kv_transfer_params correctly absent when not opted in')
+"
+echo ""
+
+# ── Summary ───────────────────────────────────────────────────
+echo "============================================"
+echo "=== Cache Stats Reporting Test PASSED ==="
+echo "============================================"
diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-long-doc-qa-l2.sh b/.buildkite/k3_tests/multiprocess/scripts/run-long-doc-qa-l2.sh
@@ -43,9 +43,12 @@ L2_MAX_SIZE_GB="${L2_MAX_SIZE_GB:-80}"
 L2_BANDWIDTH_GB="${L2_BANDWIDTH_GB:-4}"
 
 # L2 performance thresholds
-MIN_L2_SPEEDUP="${MIN_L2_SPEEDUP:-1.0}"
-MIN_L2_TTFT_SPEEDUP="${MIN_L2_TTFT_SPEEDUP:-1.0}"
-MAX_WARMUP_OVERHEAD="${MAX_WARMUP_OVERHEAD:-2.0}"
+# Recent CI runs show ~1.51-1.67x query speedup, ~1.77-2.02x TTFT speedup,
+# and ~0.87-0.99x warmup overhead. Tighten from the previous pass-anything
+# thresholds (1.0x/1.0x/2.0x) while leaving headroom for variance.
+MIN_L2_SPEEDUP="${MIN_L2_SPEEDUP:-1.3}"
+MIN_L2_TTFT_SPEEDUP="${MIN_L2_TTFT_SPEEDUP:-1.5}"
+MAX_WARMUP_OVERHEAD="${MAX_WARMUP_OVERHEAD:-1.2}"
 
 L2_RESULTS_DIR="$RESULTS_DIR/long_doc_qa_l2"
 PID_FILE="/tmp/lmcache_mp_pids_${BUILD_ID}"

diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-long-doc-qa.sh b/.buildkite/k3_tests/multiprocess/scripts/run-long-doc-qa.sh
@@ -27,9 +27,11 @@ SHUFFLE_SEED="${SHUFFLE_SEED:-0}"
 MAX_INFLIGHT_REQUESTS="${MAX_INFLIGHT_REQUESTS:-5}"
 
 # Relative performance thresholds (compared against baseline run in same job)
-# Allow at most 10% slower than baseline for both metrics
-MAX_TTFT_SLOWDOWN_PCT="${MAX_TTFT_SLOWDOWN_PCT:-10}"
-MAX_ROUND_TIME_SLOWDOWN_PCT="${MAX_ROUND_TIME_SLOWDOWN_PCT:-10}"
+# Negative values mean LMCache must be *faster* than baseline by at least that %.
+# Recent CI runs show ~77-84% TTFT improvement and ~27-40% round-time improvement,
+# so requiring 60% and 15% respectively leaves comfortable headroom.
+MAX_TTFT_SLOWDOWN_PCT="${MAX_TTFT_SLOWDOWN_PCT:--60}"
+MAX_ROUND_TIME_SLOWDOWN_PCT="${MAX_ROUND_TIME_SLOWDOWN_PCT:--15}"
 
 # Output directory
 LONG_DOC_QA_DIR="$RESULTS_DIR/long_doc_qa"
@@ -43,9 +45,9 @@ echo "Number of documents: $NUM_DOCUMENTS"
 echo "Output length: $OUTPUT_LEN"
 echo "Results dir: $LONG_DOC_QA_DIR"
 echo ""
-echo "Performance thresholds (relative to baseline):"
-echo "  Max TTFT slowdown: ${MAX_TTFT_SLOWDOWN_PCT}%"
-echo "  Max query round time slowdown: ${MAX_ROUND_TIME_SLOWDOWN_PCT}%"
+echo "Performance thresholds (relative to baseline, negative = must be faster):"
+echo "  Max TTFT slowdown: ${MAX_TTFT_SLOWDOWN_PCT}% (LMCache must be >= $(echo "$MAX_TTFT_SLOWDOWN_PCT" | tr -d '-')% faster)"
+echo "  Max round time slowdown: ${MAX_ROUND_TIME_SLOWDOWN_PCT}% (LMCache must be >= $(echo "$MAX_ROUND_TIME_SLOWDOWN_PCT" | tr -d '-')% faster)"
 echo ""
 
 mkdir -p "$LONG_DOC_QA_DIR"
@@ -196,12 +198,16 @@ def check_metric(name, lmcache_val, baseline_val, max_slowdown_pct):
         print(f"{name}: unable to compare (lmcache={lmcache_val}, baseline={baseline_val}) -- FAIL")
         return False
     pct = ((lmc - base) / base) * 100
+    label = f"{abs(pct):.1f}% faster" if pct < 0 else f"{pct:.1f}% slower"
+    if max_slowdown_pct < 0:
+        threshold_label = f"need >= {abs(max_slowdown_pct):.0f}% faster"
+    else:
+        threshold_label = f"max {max_slowdown_pct}% slower"
     if pct <= max_slowdown_pct:
-        label = f"{abs(pct):.1f}% faster" if pct < 0 else f"{pct:.1f}% slower"
-        print(f"{name}: {lmc:.4f}s vs baseline {base:.4f}s ({label}, max {max_slowdown_pct}% slower) -- PASS")
+        print(f"{name}: {lmc:.4f}s vs baseline {base:.4f}s ({label}, {threshold_label}) -- PASS")
         return True
     else:
-        print(f"{name}: {lmc:.4f}s vs baseline {base:.4f}s ({pct:.1f}% slower, max {max_slowdown_pct}% slower) -- FAIL")
+        print(f"{name}: {lmc:.4f}s vs baseline {base:.4f}s ({label}, {threshold_label}) -- FAIL")
         return False
 
 failed = False

diff --git a/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh b/.buildkite/k3_tests/multiprocess/scripts/run-single-test.sh
@@ -94,9 +94,12 @@ case "$TEST_NAME" in
     restart_recovery)
         exec_script="${SCRIPT_DIR}/run-restart-recovery.sh"
         ;;
+    cache_stats)
+        exec_script="${SCRIPT_DIR}/run-cache-stats.sh"
+        ;;
     *)
         echo "Unknown test: $TEST_NAME"
-        echo "Valid tests: lm_eval, vllm_bench, long_doc_qa, long_doc_qa_l2, fault_tolerance, deadlock, restart_recovery"
+        echo "Valid tests: lm_eval, vllm_bench, long_doc_qa, long_doc_qa_l2, fault_tolerance, deadlock, restart_recovery, cache_stats"
         exit 1
         ;;
 esac

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -25,6 +25,7 @@ steps:
         export CXX=hipcc
         export BUILD_WITH_HIP=1
         uv pip install torch torchvision --index-url https://download.pytorch.org/whl/rocm7.0
+        uv pip install -r requirements/rocm_core.txt
       fi
 
       uv pip install -r requirements/common.txt

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -338,6 +338,12 @@ jobs:
                 --tag lmcache/vllm-openai:latest --tag lmcache/vllm-openai:${{ env.LATEST_TAG }} \
                 --file docker/Dockerfile .
 
+            # `lmcache --help` exercises eager CLI subcommand discovery,
+            # so a missing runtime dep fails the build before the image ships.
+            - name: Smoke test lmcache/vllm-openai (cu13)
+              run: |
+                docker run --rm --entrypoint lmcache lmcache/vllm-openai:${{ env.LATEST_TAG }} --help
+
             - name: Push lmcache/vllm-openai container image to DockerHub
               run: |
                 docker push lmcache/vllm-openai:latest
@@ -354,6 +360,10 @@ jobs:
                 --tag lmcache/vllm-openai:lightweight --tag lmcache/vllm-openai:${{ env.LATEST_TAG }}-lightweight \
                 --file docker/Dockerfile.lightweight .
 
+            - name: Smoke test lmcache/vllm-openai:lightweight
+              run: |
+                docker run --rm --entrypoint lmcache lmcache/vllm-openai:${{ env.LATEST_TAG }}-lightweight --help
+
             - name: Push lmcache/vllm-openai:lightweight image to DockerHub
               run: |
                 docker push lmcache/vllm-openai:lightweight
@@ -375,6 +385,10 @@ jobs:
                 --tag lmcache/standalone:latest-cu130 --tag lmcache/standalone:${{ env.LATEST_TAG }}-cu130 \
                 --file docker/Dockerfile.standalone .
 
+            - name: Smoke test lmcache/standalone (cu13)
+              run: |
+                docker run --rm --entrypoint lmcache lmcache/standalone:${{ env.LATEST_TAG }} --help
+
             - name: Push lmcache/standalone container image to DockerHub
               run: |
                 docker push lmcache/standalone:latest
@@ -398,6 +412,11 @@ jobs:
                 --tag lmcache/vllm-openai:latest-cu129 --tag lmcache/vllm-openai:${{ env.LATEST_TAG }}-cu129 \
                 --file docker/Dockerfile .
 
+            - name: Smoke test lmcache/vllm-openai (cu12.9)
+              if: needs.publish-cu129-github-release.result == 'success'
+              run: |
+                docker run --rm --entrypoint lmcache lmcache/vllm-openai:${{ env.LATEST_TAG }}-cu129 --help
+
             - name: Push lmcache/vllm-openai cu129 container image to DockerHub
               if: needs.publish-cu129-github-release.result == 'success'
               run: |
@@ -419,6 +438,11 @@ jobs:
                 --tag lmcache/standalone:latest-cu129 --tag lmcache/standalone:${{ env.LATEST_TAG }}-cu129 \
                 --file docker/Dockerfile.standalone .
 
+            - name: Smoke test lmcache/standalone (cu12.9)
+              if: needs.publish-cu129-github-release.result == 'success'
+              run: |
+                docker run --rm --entrypoint lmcache lmcache/standalone:${{ env.LATEST_TAG }}-cu129 --help
+
             - name: Push lmcache/standalone cu129 container image to DockerHub
               if: needs.publish-cu129-github-release.result == 'success'
               run: |