hlin99 · hlin99 · May 26, 2026 · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/.buildkite/k3_tests/unit/run.sh b/.buildkite/k3_tests/unit/run.sh
@@ -35,7 +35,6 @@ pytest --maxfail=1 --cov=lmcache \
     --cov-report term --cov-report=html:coverage-test \
     --cov-report=xml:coverage-test.xml --html=durations/test.html \
     --ignore=tests/disagg --ignore=tests/v1/test_pos_kernels.py \
-    --ignore=tests/v1/test_nixl_storage.py \
     --ignore=tests/skipped \
     --ignore=tests/v1/storage_backend/test_eic.py
 

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -37,7 +37,6 @@ steps:
         --cov-report term --cov-report=html:coverage-test \
         --cov-report=xml:coverage-test.xml --html=durations/test.html \
         --ignore=tests/disagg --ignore=tests/v1/test_pos_kernels.py \
-        --ignore=tests/v1/test_nixl_storage.py \
         --ignore=tests/v1/test_nixl_batched_contains.py \
         --ignore=tests/v1/test_device_id_race.py \
         --ignore=tests/skipped \

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -99,7 +99,7 @@ jobs:
 
       - name: "Run non-CUDA unit tests"
         run: |
-          pytest --ignore=tests/disagg --ignore=tests/v1/test_nixl_storage.py \
+          pytest --ignore=tests/disagg \
             --ignore=tests/v1/multiprocess/ \
             --ignore=tests/v1/distributed/ \
             --ignore=tests/v1/mp_observability/ \

diff --git a/AGENTS.md b/AGENTS.md
@@ -47,11 +47,10 @@ BUILD_WITH_HIP=1 pip install -e .
 ```bash
 # Run standard test suite (mirrors CI)
 pytest -xvs --ignore=tests/disagg \
-  --ignore=tests/v1/test_nixl_storage.py \
-  --ignore=tests/v1/multiprocess/ \
-  --ignore=tests/v1/distributed/ \
-  --ignore=tests/skipped \
-  --ignore=tests/v1/storage_backend/test_eic.py
+ --ignore=tests/v1/multiprocess/ \
+ --ignore=tests/v1/distributed/ \
+ --ignore=tests/skipped \
+ --ignore=tests/v1/storage_backend/test_eic.py
 
 # Run a single test file
 pytest -xvs tests/v1/test_cache_engine.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -47,7 +47,6 @@ BUILD_WITH_HIP=1 pip install -e .
 ```bash
 # Run standard test suite (mirrors CI)
 pytest -xvs --ignore=tests/disagg \
-  --ignore=tests/v1/test_nixl_storage.py \
   --ignore=tests/v1/multiprocess/ \
   --ignore=tests/v1/distributed/ \
   --ignore=tests/skipped \

diff --git a/csrc/mem_alloc.cpp b/csrc/mem_alloc.cpp
@@ -1,15 +1,35 @@
 #include <cuda_runtime.h>
 #include <stdexcept>
 #include <string>
+#include <cassert>
 #include <sys/mman.h>
 #include <sys/syscall.h>
+#include <linux/mman.h>
 #include <unistd.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <cstring>            // for strerror
 #include <linux/mempolicy.h>  // for MPOL_BIND, MPOL_MF_MOVE, MPOL_MF_STRICT
 #include "mem_alloc.h"
 
+static constexpr size_t HUGEPAGE_SIZE = 2UL * 1024 * 1024;  // MAP_HUGE_2MB
+
+// Rounds `size` up to the next multiple of HUGEPAGE_SIZE. A value that is
+// already a multiple (including 0) is returned unchanged.
+static inline size_t _align_hugepage(size_t size) {
+  const size_t mask = HUGEPAGE_SIZE - 1;
+  return (size + mask) & ~mask;
+}
+
+// Maps `size` bytes of private, anonymous, read/write memory and returns the
+// start of the mapping. When `hugepages` is true the mapping is requested
+// with MAP_HUGETLB | MAP_HUGE_2MB.
+// Throws std::runtime_error whose message starts with "mmap failed: " and
+// carries the strerror() text; for hugepage requests a hint about the
+// hugepage reservation is appended.
+static void* _mmap_anon(size_t size, bool hugepages) {
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  if (hugepages) {
+    flags |= MAP_HUGETLB | MAP_HUGE_2MB;
+  }
+  void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+  if (ptr == MAP_FAILED) {
+    // Snapshot errno first: building the std::string below may allocate and
+    // is not guaranteed to leave errno untouched.
+    const int err = errno;
+    std::string msg = std::string("mmap failed: ") + strerror(err);
+    if (hugepages) {
+      // A hugepage mapping commonly fails because the hugetlb pool is empty
+      // or exhausted, so point the operator at the reservation knob.
+      msg +=
+          " (MAP_HUGETLB 2 MiB mapping requested; check that enough "
+          "hugepages are reserved via vm.nr_hugepages)";
+    }
+    throw std::runtime_error(msg);
+  }
+  return ptr;
+}
+
 uintptr_t alloc_pinned_ptr(size_t size, unsigned int flags) {
   void* ptr = nullptr;
   cudaError_t err = cudaHostAlloc(&ptr, size, flags);
@@ -26,6 +46,36 @@ void free_pinned_ptr(uintptr_t ptr) {
   }
 }
 
+// Allocates hugepage-backed host memory and registers it with CUDA.
+// `size` is rounded up to a multiple of HUGEPAGE_SIZE before mapping, and the
+// rounded size is what gets registered; `flags` is forwarded to
+// cudaHostRegister unchanged. Returns the mapping address as an integer.
+// Throws std::runtime_error if the mapping or the registration fails; on a
+// registration failure the mapping is released before throwing.
+uintptr_t alloc_hugepage_pinned_ptr(size_t size, unsigned int flags) {
+  const size_t aligned = _align_hugepage(size);
+  void* const base = _mmap_anon(aligned, true);
+
+  const cudaError_t rc = cudaHostRegister(base, aligned, flags);
+  if (rc == cudaSuccess) {
+    return reinterpret_cast<uintptr_t>(base);
+  }
+
+  // Registration failed: drop the mapping so the pages are not leaked.
+  munmap(base, aligned);
+  throw std::runtime_error(std::string("cudaHostRegister failed: ") +
+                           cudaGetErrorString(rc));
+}
+
+// Unregisters and unmaps a region created by alloc_hugepage_pinned_ptr.
+// `size` is rounded up to a multiple of HUGEPAGE_SIZE, the same rounding the
+// allocator applies, so the whole mapping is released.
+// The region is unmapped even if unregistering fails. Throws
+// std::runtime_error for a cudaHostUnregister failure; otherwise for a munmap
+// failure.
+void free_hugepage_pinned_ptr(uintptr_t ptr, size_t size) {
+  void* const base = reinterpret_cast<void*>(ptr);
+  const size_t aligned = _align_hugepage(size);
+
+  // Unpin first, then unmap.
+  const cudaError_t rc = cudaHostUnregister(base);
+  const bool unmapped = (munmap(base, aligned) == 0);
+
+  // An unregister failure takes precedence over the munmap result.
+  if (rc != cudaSuccess) {
+    throw std::runtime_error(std::string("cudaHostUnregister failed: ") +
+                             cudaGetErrorString(rc));
+  }
+  if (!unmapped) {
+    throw std::runtime_error(std::string("munmap failed: ") + strerror(errno));
+  }
+}
+
 void batched_memcpy(const std::vector<uintptr_t>& src_ptrs,
                     const std::vector<uintptr_t>& dst_ptrs,
                     const std::vector<size_t>& sizes) {
@@ -43,10 +93,11 @@ void batched_memcpy(const std::vector<uintptr_t>& src_ptrs,
   }
 }
 
-static void first_touch(void* p, size_t size) {
-  const long ps = sysconf(_SC_PAGESIZE);
+static void first_touch(void* p, size_t size, bool hugepages) {
+  const size_t ps =
+      hugepages ? HUGEPAGE_SIZE : static_cast<size_t>(sysconf(_SC_PAGESIZE));
   for (size_t off = 0; off < size; off += ps) {
-    volatile char* c = (volatile char*)p + off;
+    volatile char* c = static_cast<volatile char*>(p) + off;
     *c = 0;
   }
 }
@@ -58,11 +109,12 @@ static inline int mbind_sys(void* addr, unsigned long len, int mode,
   return (rc == -1) ? -errno : 0;
 }
 
-uintptr_t alloc_numa_ptr(size_t size, int node) {
-  void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
-                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-  if (ptr == MAP_FAILED)
-    throw std::runtime_error(std::string("mmap failed: ") + strerror(errno));
+static uintptr_t _alloc_numa_impl(size_t size, int node, bool hugepages) {
+  if (hugepages) {
+    assert(size % HUGEPAGE_SIZE == 0);
+  }
+
+  void* ptr = _mmap_anon(size, hugepages);
 
   // Maximum of 64 numa nodes
   unsigned long mask = 1UL << node;
@@ -74,20 +126,25 @@ uintptr_t alloc_numa_ptr(size_t size, int node) {
     throw std::runtime_error(std::string("mbind failed: ") + strerror(err));
   }
 
-  first_touch(ptr, size);
+  first_touch(ptr, size, hugepages);
 
   return reinterpret_cast<uintptr_t>(ptr);
 }
 
+// Regular-page entry point: forwards `size` and `node` to _alloc_numa_impl
+// with hugepages disabled and returns the address it produces.
+uintptr_t alloc_numa_ptr(size_t size, int node) {
+  constexpr bool kUseHugepages = false;
+  return _alloc_numa_impl(size, node, kUseHugepages);
+}
+
 void free_numa_ptr(uintptr_t ptr, size_t size) {
   void* p = reinterpret_cast<void*>(ptr);
   if (munmap(p, size) != 0) {
     throw std::runtime_error(std::string("munmap failed: ") + strerror(errno));
   }
 }
 
-uintptr_t alloc_pinned_numa_ptr(size_t size, int node) {
-  void* ptr = reinterpret_cast<void*>(alloc_numa_ptr(size, node));
+static uintptr_t _alloc_pinned_numa_impl(size_t size, int node,
+                                         bool hugepages) {
+  void* ptr = reinterpret_cast<void*>(_alloc_numa_impl(size, node, hugepages));
 
   cudaError_t st = cudaHostRegister(ptr, size, 0);
   if (st != cudaSuccess) {
@@ -99,6 +156,15 @@ uintptr_t alloc_pinned_numa_ptr(size_t size, int node) {
   return reinterpret_cast<uintptr_t>(ptr);
 }
 
+// Regular-page entry point: forwards `size` and `node` to
+// _alloc_pinned_numa_impl with hugepages disabled and returns its result.
+uintptr_t alloc_pinned_numa_ptr(size_t size, int node) {
+  constexpr bool kUseHugepages = false;
+  return _alloc_pinned_numa_impl(size, node, kUseHugepages);
+}
+
+// Hugepage entry point: rounds `size` up to a multiple of HUGEPAGE_SIZE and
+// forwards it, with `node`, to _alloc_pinned_numa_impl with hugepages enabled.
+uintptr_t alloc_hugepage_pinned_numa_ptr(size_t size, int node) {
+  const size_t aligned = _align_hugepage(size);
+  return _alloc_pinned_numa_impl(aligned, node, /*hugepages=*/true);
+}
+
 void free_pinned_numa_ptr(uintptr_t ptr, size_t size) {
   void* p = reinterpret_cast<void*>(ptr);
   // Unpin first, then unmap.
@@ -113,6 +179,11 @@ void free_pinned_numa_ptr(uintptr_t ptr, size_t size) {
   }
 }
 
+// Hugepage counterpart of free_pinned_numa_ptr: rounds `size` up to a
+// multiple of HUGEPAGE_SIZE (the rounding alloc_hugepage_pinned_numa_ptr
+// applies) and delegates the release to free_pinned_numa_ptr.
+void free_hugepage_pinned_numa_ptr(uintptr_t ptr, size_t size) {
+  const size_t aligned = _align_hugepage(size);
+  free_pinned_numa_ptr(ptr, aligned);
+}
+
 uintptr_t alloc_shm_pinned_ptr(size_t size, const std::string& shm_name) {
   int fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0600);
   if (fd < 0)
@@ -133,7 +204,7 @@ uintptr_t alloc_shm_pinned_ptr(size_t size, const std::string& shm_name) {
     throw std::runtime_error(std::string("mmap failed: ") + strerror(errno));
   }
 
-  first_touch(ptr, size);
+  first_touch(ptr, size, false);
 
   cudaError_t st = cudaHostRegister(ptr, size, 0);
   if (st != cudaSuccess) {

diff --git a/csrc/mem_alloc.h b/csrc/mem_alloc.h
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: Apache-2.0
+
 #include <cstdint>
 #include <string>
 #include <vector>
@@ -15,3 +17,11 @@ void free_numa_ptr(uintptr_t ptr, size_t size);
 void free_pinned_numa_ptr(uintptr_t ptr, size_t size);
 void free_shm_pinned_ptr(uintptr_t ptr, size_t size,
                          const std::string& shm_name);
+
+// Hugepage variants (MAP_HUGETLB). Not available for shm: /dev/shm usually
+// uses tmpfs, and tmpfs does not support MAP_HUGETLB.
+uintptr_t alloc_hugepage_pinned_ptr(size_t size, unsigned int flags);
+uintptr_t alloc_hugepage_pinned_numa_ptr(size_t size, int node);
+
+void free_hugepage_pinned_ptr(uintptr_t ptr, size_t size);
+void free_hugepage_pinned_numa_ptr(uintptr_t ptr, size_t size);
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
@@ -61,9 +61,15 @@ PYBIND11_MODULE(c_ops, m) {
   m.def("alloc_pinned_ptr", &alloc_pinned_ptr,
         py::call_guard<py::gil_scoped_release>());
   m.def("free_pinned_ptr", &free_pinned_ptr);
+  m.def("alloc_hugepage_pinned_ptr", &alloc_hugepage_pinned_ptr,
+        py::call_guard<py::gil_scoped_release>());
+  m.def("free_hugepage_pinned_ptr", &free_hugepage_pinned_ptr);
   m.def("alloc_pinned_numa_ptr", &alloc_pinned_numa_ptr,
         py::call_guard<py::gil_scoped_release>());
   m.def("free_pinned_numa_ptr", &free_pinned_numa_ptr);
+  m.def("alloc_hugepage_pinned_numa_ptr", &alloc_hugepage_pinned_numa_ptr,
+        py::call_guard<py::gil_scoped_release>());
+  m.def("free_hugepage_pinned_numa_ptr", &free_hugepage_pinned_numa_ptr);
   m.def("alloc_numa_ptr", &alloc_numa_ptr,
         py::call_guard<py::gil_scoped_release>());
   m.def("free_numa_ptr", &free_numa_ptr);

diff --git a/docs/source/api_reference/configurations.rst b/docs/source/api_reference/configurations.rst
@@ -34,6 +34,9 @@ Basic cache settings that control the core functionality of LMCache.
    * - max_local_cpu_size
      - LMCACHE_MAX_LOCAL_CPU_SIZE
      - Maximum CPU cache size in GB. Default: 5.0
+   * - local_cpu_use_hugepages
+     - LMCACHE_LOCAL_CPU_USE_HUGEPAGES
+     - Whether to use Linux hugepages (2 MiB) for CPU-pinned KV cache memory. Not compatible with P2P mode or shared memory (multiprocess). Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Values: true/false. Default: false
    * - local_disk
      - LMCACHE_LOCAL_DISK
      - Path (or comma-separated paths) to local disk cache directories. Format: ``"file:///path/to/cache"`` or ``"/path/a,/path/b"`` for multi-device I/O. See ``local_disk_path_sharding`` for how paths are assigned to GPUs.
@@ -354,6 +357,8 @@ Settings for using Nixl as a storage backend instead of disaggregated prefill. T
      - Number of files or objects in the storage pool
    * - nixl_endpoint_list
      - List of object-storage endpoint URLs for per-worker distribution. Each TP worker selects an entry round-robin by ``local_worker_id``, overriding ``nixl_backend_params.endpoint_override``. Only applied when ``nixl_backend`` is ``"OBJ"`` (silently ignored otherwise). Each entry must start with ``http://`` or ``https://``; an empty list raises ``ValueError`` at engine init.
+   * - nixl_use_hugepages
+     - Whether to use Linux hugepages (2 MiB) for the NIXL CPU buffer. Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Values: true/false. Default: false
 
 
 Additional Storage Configurations

diff --git a/docs/source/kv_cache/storage_backends/cpu_ram.rst b/docs/source/kv_cache/storage_backends/cpu_ram.rst
@@ -63,6 +63,56 @@ tokens into the pinned CPU RAM from the disk or remote storage (*if* the KV cach
 tokens are already stored there). This can preemptively avoid the latency of the disk and
 remote KV transfer if we predict these tokens will be requested soon (e.g. structured or agentic workflows).
 
+.. _cpu_ram-hugepage-support:
+
+Hugepage Support
+-----------------
+
+By default LMCache allocates CPU-pinned memory using regular 4 KiB pages.
+For large KV cache buffers (multiple gigabytes), enabling **Linux hugepages**
+(2 MiB pages) can reduce TLB (Translation Lookaside Buffer) pressure and
+improve memory access performance.
+
+**System prerequisite**
+
+Hugepages must be pre-allocated at the OS level before LMCache starts.
+To find the number of pages needed, divide the desired buffer size by 2 MiB and round up.
+For example, 5 GB requires at least 2560 pages:
+
+.. code-block:: bash
+
+    # Allocate 2560 hugepages (5 GB)
+    sudo sysctl -w vm.nr_hugepages=2560
+
+    # Make persistent across reboots
+    echo 'vm.nr_hugepages=2560' | sudo tee -a /etc/sysctl.conf
+
+Verify that pages are available:
+
+.. code-block:: bash
+
+    grep HugePages /proc/meminfo
+    # HugePages_Total:    2560
+    # HugePages_Free:     2560
+
+**Configuration**
+
+.. code-block:: yaml
+
+    local_cpu_use_hugepages: true
+
+Or via environment variable:
+
+.. code-block:: bash
+
+    export LMCACHE_LOCAL_CPU_USE_HUGEPAGES=true
+
+**Restrictions**
+
+- Hugepages are **not compatible with P2P mode** (``enable_p2p: true``).
+- Hugepages are **not compatible with shared memory** (``shm_name`` is set).
+- On non-CUDA platforms, hugepages are not supported. Regular allocation will be used as fallback.
+
 .. _cpu_ram-online-inference-example:
 
 Online Inference Example

diff --git a/docs/source/kv_cache/storage_backends/nixl.rst b/docs/source/kv_cache/storage_backends/nixl.rst
@@ -37,7 +37,8 @@ Example ``lmcache-config.yaml`` for POSIX backend:
       nixl_backend: POSIX
       nixl_pool_size: 64
       nixl_path: /mnt/nixl/cache/
-      use_direct_io: True
+      use_direct_io: true
+      nixl_use_hugepages: true  # optional, requires pre-allocated hugepages
 
 Key settings:
 
@@ -51,6 +52,8 @@ Key settings:
 
 - ``nixl_backend``: configuration of which nixl backend to use for storage.
 
+- ``nixl_use_hugepages``: whether to use Linux hugepages (2 MiB) for the NIXL CPU buffer. Not supported for GPU buffers. Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Default: ``false``.
+
 .. note::
 
     Supported backends are: ["GDS", "GDS_MT", "POSIX", "HF3FS", "OBJ", "AZURE_BLOB"].