Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .buildkite/k3_tests/unit/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ pytest --maxfail=1 --cov=lmcache \
--cov-report term --cov-report=html:coverage-test \
--cov-report=xml:coverage-test.xml --html=durations/test.html \
--ignore=tests/disagg --ignore=tests/v1/test_pos_kernels.py \
--ignore=tests/v1/test_nixl_storage.py \
--ignore=tests/skipped \
--ignore=tests/v1/storage_backend/test_eic.py

Expand Down
1 change: 0 additions & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ steps:
--cov-report term --cov-report=html:coverage-test \
--cov-report=xml:coverage-test.xml --html=durations/test.html \
--ignore=tests/disagg --ignore=tests/v1/test_pos_kernels.py \
--ignore=tests/v1/test_nixl_storage.py \
--ignore=tests/v1/test_nixl_batched_contains.py \
--ignore=tests/v1/test_device_id_race.py \
--ignore=tests/skipped \
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ jobs:

- name: "Run non-CUDA unit tests"
run: |
pytest --ignore=tests/disagg --ignore=tests/v1/test_nixl_storage.py \
pytest --ignore=tests/disagg \
--ignore=tests/v1/multiprocess/ \
--ignore=tests/v1/distributed/ \
--ignore=tests/v1/mp_observability/ \
Expand Down
9 changes: 4 additions & 5 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,10 @@ BUILD_WITH_HIP=1 pip install -e .
```bash
# Run standard test suite (mirrors CI)
pytest -xvs --ignore=tests/disagg \
--ignore=tests/v1/test_nixl_storage.py \
--ignore=tests/v1/multiprocess/ \
--ignore=tests/v1/distributed/ \
--ignore=tests/skipped \
--ignore=tests/v1/storage_backend/test_eic.py
--ignore=tests/v1/multiprocess/ \
--ignore=tests/v1/distributed/ \
--ignore=tests/skipped \
--ignore=tests/v1/storage_backend/test_eic.py

# Run a single test file
pytest -xvs tests/v1/test_cache_engine.py
Expand Down
1 change: 0 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ BUILD_WITH_HIP=1 pip install -e .
```bash
# Run standard test suite (mirrors CI)
pytest -xvs --ignore=tests/disagg \
--ignore=tests/v1/test_nixl_storage.py \
--ignore=tests/v1/multiprocess/ \
--ignore=tests/v1/distributed/ \
--ignore=tests/skipped \
Expand Down
95 changes: 83 additions & 12 deletions csrc/mem_alloc.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,35 @@
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>
#include <cassert>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/mman.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <cstring> // for strerror
#include <linux/mempolicy.h> // for MPOL_BIND, MPOL_MF_MOVE, MPOL_MF_STRICT
#include "mem_alloc.h"

static constexpr size_t HUGEPAGE_SIZE = 2UL * 1024 * 1024;  // MAP_HUGE_2MB

// Rounds `size` up to the nearest multiple of HUGEPAGE_SIZE. A size that is
// already a multiple of HUGEPAGE_SIZE (including 0) is returned unchanged.
static inline size_t _align_hugepage(size_t size) {
  const size_t remainder = size % HUGEPAGE_SIZE;
  if (remainder == 0) {
    return size;
  }
  return size + (HUGEPAGE_SIZE - remainder);
}

// Creates a private, anonymous, read/write mapping of `size` bytes.
// When `hugepages` is true the mapping is additionally requested with
// MAP_HUGETLB | MAP_HUGE_2MB.
// Throws std::runtime_error carrying the strerror(errno) text if mmap fails.
static void* _mmap_anon(size_t size, bool hugepages) {
  const int base_flags = MAP_PRIVATE | MAP_ANONYMOUS;
  const int map_flags =
      hugepages ? (base_flags | MAP_HUGETLB | MAP_HUGE_2MB) : base_flags;

  void* const mapping =
      mmap(nullptr, size, PROT_READ | PROT_WRITE, map_flags, -1, 0);
  if (mapping != MAP_FAILED) {
    return mapping;
  }
  throw std::runtime_error(std::string("mmap failed: ") + strerror(errno));
}

uintptr_t alloc_pinned_ptr(size_t size, unsigned int flags) {
void* ptr = nullptr;
cudaError_t err = cudaHostAlloc(&ptr, size, flags);
Expand All @@ -26,6 +46,36 @@ void free_pinned_ptr(uintptr_t ptr) {
}
}

uintptr_t alloc_hugepage_pinned_ptr(size_t size, unsigned int flags) {
size = _align_hugepage(size);
void* ptr = _mmap_anon(size, true);

cudaError_t st = cudaHostRegister(ptr, size, flags);
if (st != cudaSuccess) {
munmap(ptr, size);
throw std::runtime_error(std::string("cudaHostRegister failed: ") +
cudaGetErrorString(st));
}

return reinterpret_cast<uintptr_t>(ptr);
}

void free_hugepage_pinned_ptr(uintptr_t ptr, size_t size) {
size = _align_hugepage(size);
void* p = reinterpret_cast<void*>(ptr);

// Unpin first, then unmap.
cudaError_t st = cudaHostUnregister(p);
if (st != cudaSuccess) {
munmap(p, size);
throw std::runtime_error(std::string("cudaHostUnregister failed: ") +
cudaGetErrorString(st));
}
if (munmap(p, size) != 0) {
throw std::runtime_error(std::string("munmap failed: ") + strerror(errno));
}
}

void batched_memcpy(const std::vector<uintptr_t>& src_ptrs,
const std::vector<uintptr_t>& dst_ptrs,
const std::vector<size_t>& sizes) {
Expand All @@ -43,10 +93,11 @@ void batched_memcpy(const std::vector<uintptr_t>& src_ptrs,
}
}

static void first_touch(void* p, size_t size) {
const long ps = sysconf(_SC_PAGESIZE);
static void first_touch(void* p, size_t size, bool hugepages) {
const size_t ps =
hugepages ? HUGEPAGE_SIZE : static_cast<size_t>(sysconf(_SC_PAGESIZE));
for (size_t off = 0; off < size; off += ps) {
volatile char* c = (volatile char*)p + off;
volatile char* c = static_cast<volatile char*>(p) + off;
*c = 0;
}
}
Expand All @@ -58,11 +109,12 @@ static inline int mbind_sys(void* addr, unsigned long len, int mode,
return (rc == -1) ? -errno : 0;
}

uintptr_t alloc_numa_ptr(size_t size, int node) {
void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED)
throw std::runtime_error(std::string("mmap failed: ") + strerror(errno));
static uintptr_t _alloc_numa_impl(size_t size, int node, bool hugepages) {
if (hugepages) {
assert(size % HUGEPAGE_SIZE == 0);
}

void* ptr = _mmap_anon(size, hugepages);

// Maximum of 64 numa nodes
unsigned long mask = 1UL << node;
Expand All @@ -74,20 +126,25 @@ uintptr_t alloc_numa_ptr(size_t size, int node) {
throw std::runtime_error(std::string("mbind failed: ") + strerror(err));
}

first_touch(ptr, size);
first_touch(ptr, size, hugepages);

return reinterpret_cast<uintptr_t>(ptr);
}

// Allocates `size` bytes for NUMA node `node` using regular (non-huge) pages
// by delegating to _alloc_numa_impl with hugepages disabled.
uintptr_t alloc_numa_ptr(size_t size, int node) {
  constexpr bool kUseHugepages = false;
  return _alloc_numa_impl(size, node, kUseHugepages);
}

// Unmaps the `size`-byte region that starts at address `ptr`.
// Throws std::runtime_error carrying the strerror(errno) text if munmap fails.
void free_numa_ptr(uintptr_t ptr, size_t size) {
  const int rc = munmap(reinterpret_cast<void*>(ptr), size);
  if (rc == 0) {
    return;
  }
  throw std::runtime_error(std::string("munmap failed: ") + strerror(errno));
}

uintptr_t alloc_pinned_numa_ptr(size_t size, int node) {
void* ptr = reinterpret_cast<void*>(alloc_numa_ptr(size, node));
static uintptr_t _alloc_pinned_numa_impl(size_t size, int node,
bool hugepages) {
void* ptr = reinterpret_cast<void*>(_alloc_numa_impl(size, node, hugepages));

cudaError_t st = cudaHostRegister(ptr, size, 0);
if (st != cudaSuccess) {
Expand All @@ -99,6 +156,15 @@ uintptr_t alloc_pinned_numa_ptr(size_t size, int node) {
return reinterpret_cast<uintptr_t>(ptr);
}

// Allocates `size` bytes of CUDA-registered memory for NUMA node `node` using
// regular (non-huge) pages, via _alloc_pinned_numa_impl.
uintptr_t alloc_pinned_numa_ptr(size_t size, int node) {
  constexpr bool kUseHugepages = false;
  return _alloc_pinned_numa_impl(size, node, kUseHugepages);
}

// Hugepage variant of alloc_pinned_numa_ptr: rounds `size` up to a multiple
// of HUGEPAGE_SIZE and allocates via _alloc_pinned_numa_impl with hugepages
// enabled.
uintptr_t alloc_hugepage_pinned_numa_ptr(size_t size, int node) {
  const size_t aligned_bytes = _align_hugepage(size);
  return _alloc_pinned_numa_impl(aligned_bytes, node, /*hugepages=*/true);
}

void free_pinned_numa_ptr(uintptr_t ptr, size_t size) {
void* p = reinterpret_cast<void*>(ptr);
// Unpin first, then unmap.
Expand All @@ -113,6 +179,11 @@ void free_pinned_numa_ptr(uintptr_t ptr, size_t size) {
}
}

// Releases a region obtained from alloc_hugepage_pinned_numa_ptr. `size` is
// rounded up to a multiple of HUGEPAGE_SIZE (matching the rounding applied on
// allocation) before being handed to free_pinned_numa_ptr.
void free_hugepage_pinned_numa_ptr(uintptr_t ptr, size_t size) {
  free_pinned_numa_ptr(ptr, _align_hugepage(size));
}

uintptr_t alloc_shm_pinned_ptr(size_t size, const std::string& shm_name) {
int fd = shm_open(shm_name.c_str(), O_CREAT | O_RDWR, 0600);
if (fd < 0)
Expand All @@ -133,7 +204,7 @@ uintptr_t alloc_shm_pinned_ptr(size_t size, const std::string& shm_name) {
throw std::runtime_error(std::string("mmap failed: ") + strerror(errno));
}

first_touch(ptr, size);
first_touch(ptr, size, false);

cudaError_t st = cudaHostRegister(ptr, size, 0);
if (st != cudaSuccess) {
Expand Down
10 changes: 10 additions & 0 deletions csrc/mem_alloc.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
// SPDX-License-Identifier: Apache-2.0

#include <cstdint>
#include <string>
#include <vector>
Expand All @@ -15,3 +17,11 @@ void free_numa_ptr(uintptr_t ptr, size_t size);
void free_pinned_numa_ptr(uintptr_t ptr, size_t size);
void free_shm_pinned_ptr(uintptr_t ptr, size_t size,
const std::string& shm_name);

// Hugepage variants (MAP_HUGETLB). Not available for shm: /dev/shm usually
// uses tmpfs, and tmpfs does not support MAP_HUGETLB.
uintptr_t alloc_hugepage_pinned_ptr(size_t size, unsigned int flags);
uintptr_t alloc_hugepage_pinned_numa_ptr(size_t size, int node);

void free_hugepage_pinned_ptr(uintptr_t ptr, size_t size);
void free_hugepage_pinned_numa_ptr(uintptr_t ptr, size_t size);
6 changes: 6 additions & 0 deletions csrc/pybind.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,15 @@ PYBIND11_MODULE(c_ops, m) {
m.def("alloc_pinned_ptr", &alloc_pinned_ptr,
py::call_guard<py::gil_scoped_release>());
m.def("free_pinned_ptr", &free_pinned_ptr);
m.def("alloc_hugepage_pinned_ptr", &alloc_hugepage_pinned_ptr,
py::call_guard<py::gil_scoped_release>());
m.def("free_hugepage_pinned_ptr", &free_hugepage_pinned_ptr);
m.def("alloc_pinned_numa_ptr", &alloc_pinned_numa_ptr,
py::call_guard<py::gil_scoped_release>());
m.def("free_pinned_numa_ptr", &free_pinned_numa_ptr);
m.def("alloc_hugepage_pinned_numa_ptr", &alloc_hugepage_pinned_numa_ptr,
py::call_guard<py::gil_scoped_release>());
m.def("free_hugepage_pinned_numa_ptr", &free_hugepage_pinned_numa_ptr);
m.def("alloc_numa_ptr", &alloc_numa_ptr,
py::call_guard<py::gil_scoped_release>());
m.def("free_numa_ptr", &free_numa_ptr);
Expand Down
5 changes: 5 additions & 0 deletions docs/source/api_reference/configurations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ Basic cache settings that control the core functionality of LMCache.
* - max_local_cpu_size
- LMCACHE_MAX_LOCAL_CPU_SIZE
- Maximum CPU cache size in GB. Default: 5.0
* - local_cpu_use_hugepages
- LMCACHE_LOCAL_CPU_USE_HUGEPAGES
- Whether to use Linux hugepages (2 MiB) for CPU-pinned KV cache memory. Not compatible with P2P mode or shared memory (multiprocess). Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Values: true/false. Default: false
* - local_disk
- LMCACHE_LOCAL_DISK
- Path (or comma-separated paths) to local disk cache directories. Format: ``"file:///path/to/cache"`` or ``"/path/a,/path/b"`` for multi-device I/O. See ``local_disk_path_sharding`` for how paths are assigned to GPUs.
Expand Down Expand Up @@ -354,6 +357,8 @@ Settings for using Nixl as a storage backend instead of disaggregated prefill. T
- Number of files or objects in the storage pool
* - nixl_endpoint_list
- List of object-storage endpoint URLs for per-worker distribution. Each TP worker selects an entry round-robin by ``local_worker_id``, overriding ``nixl_backend_params.endpoint_override``. Only applied when ``nixl_backend`` is ``"OBJ"`` (silently ignored otherwise). Each entry must start with ``http://`` or ``https://``; an empty list raises ``ValueError`` at engine init.
* - nixl_use_hugepages
- Whether to use Linux hugepages (2 MiB) for the NIXL CPU buffer. Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Values: true/false. Default: false


Additional Storage Configurations
Expand Down
50 changes: 50 additions & 0 deletions docs/source/kv_cache/storage_backends/cpu_ram.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,56 @@ tokens into the pinned CPU RAM from the disk or remote storage (*if* the KV cach
tokens are already stored there). This can preemptively avoid the latency of the disk and
remote KV transfer if we predict these tokens will be requested soon (e.g. structured or agentic workflows).

.. _cpu_ram-hugepage-support:

Hugepage Support
-----------------

By default LMCache allocates CPU-pinned memory using regular 4 KiB pages.
For large KV cache buffers (multiple gigabytes), enabling **Linux hugepages**
(2 MiB pages) can reduce TLB (Translation Lookaside Buffer) pressure and
improve memory access performance.

**System prerequisite**

Hugepages must be pre-allocated at the OS level before LMCache starts.
To find the number of pages needed, divide the desired buffer size by 2 MiB and round up.
For example, 5 GB requires at least 2560 pages:

.. code-block:: bash

# Allocate 2560 hugepages (5 GB)
sudo sysctl -w vm.nr_hugepages=2560

# Make persistent across reboots
echo 'vm.nr_hugepages=2560' | sudo tee -a /etc/sysctl.conf

Verify that pages are available:

.. code-block:: bash

grep HugePages /proc/meminfo
# HugePages_Total: 2560
# HugePages_Free: 2560

**Configuration**

.. code-block:: yaml

local_cpu_use_hugepages: true

Or via environment variable:

.. code-block:: bash

export LMCACHE_LOCAL_CPU_USE_HUGEPAGES=true

**Restrictions**

- Hugepages are **not compatible with P2P mode** (``enable_p2p: true``).
- Hugepages are **not compatible with shared memory** (``shm_name`` is set).
- On non-CUDA platforms, hugepages are not supported. Regular allocation will be used as fallback.

.. _cpu_ram-online-inference-example:

Online Inference Example
Expand Down
5 changes: 4 additions & 1 deletion docs/source/kv_cache/storage_backends/nixl.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ Example ``lmcache-config.yaml`` for POSIX backend:
nixl_backend: POSIX
nixl_pool_size: 64
nixl_path: /mnt/nixl/cache/
use_direct_io: True
use_direct_io: true
nixl_use_hugepages: true # optional, requires pre-allocated hugepages

Key settings:

Expand All @@ -51,6 +52,8 @@ Key settings:

- ``nixl_backend``: configuration of which nixl backend to use for storage.

- ``nixl_use_hugepages``: whether to use Linux hugepages (2 MiB) for the NIXL CPU buffer. Not supported for GPU buffers. Requires pre-allocated hugepages (``sysctl vm.nr_hugepages``). Default: ``false``.

.. note::

Supported backends are: ["GDS", "GDS_MT", "POSIX", "HF3FS", "OBJ", "AZURE_BLOB"].
Expand Down
Loading
Loading