diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd8..c50ffba1c 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -242,6 +242,23 @@ def __init__(
         )  # keep a reference to the array so it is not gc'd
         self.model_params.tensor_split = self._c_tensor_split
         self.model_params.vocab_only = vocab_only
+
+        # When all layers are offloaded to GPU (n_gpu_layers == -1), disable mmap
+        # to prevent the memory-mapped model file from staying resident in RAM.
+        # With mmap enabled, the entire model file remains in the page cache even
+        # after weights are copied to VRAM. Disabling mmap causes llama.cpp to use
+        # a temporary read buffer that is freed after GPU upload.
+        # NOTE: use_mmap defaults to True, so this guard cannot tell an explicit
+        # use_mmap=True apart from the default; the message must not promise an
+        # override. See: https://github.com/abetlen/llama-cpp-python/issues/1964
+        if n_gpu_layers == -1 and use_mmap and llama_cpp.llama_supports_gpu_offload():
+            if self.verbose:
+                print(
+                    "Automatically disabling mmap because all layers are offloaded "
+                    "to GPU (n_gpu_layers=-1). This reduces host RAM usage. "
+                    "Pass a non-negative n_gpu_layers to keep mmap enabled.",
+                    file=sys.stderr,
+                )
+            use_mmap = False
 
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
 
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c951241..f6dc85786 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -48,7 +48,8 @@ class ModelSettings(BaseSettings):
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_supports_mmap(),
-        description="Use mmap.",
+        description="Use mmap. Automatically disabled to reduce host RAM usage "
+        "when n_gpu_layers is -1 (full GPU offload).",
     )
     use_mlock: bool = Field(
         default=llama_cpp.llama_supports_mlock(),
diff --git a/tests/test_mmap_gpu_offload.py b/tests/test_mmap_gpu_offload.py
new file mode 100644
index 000000000..fa725771c
--- /dev/null
+++ b/tests/test_mmap_gpu_offload.py
@@ -0,0 +1,75 @@
+"""Tests for automatic mmap disabling when all layers are offloaded to GPU.
+
+See: https://github.com/abetlen/llama-cpp-python/issues/1964
+"""
+
+import sys
+from unittest.mock import MagicMock
+from dataclasses import dataclass
+
+# Stub the native C library so tests can run without compiling llama.cpp
+_mock_llama_cpp = MagicMock()
+_mock_llama_cpp.llama_log_callback = lambda f: f
+_mock_llama_cpp.llama_log_set = MagicMock()
+sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)
+
+_mock_llama = MagicMock()
+_mock_llama.StoppingCriteriaList = list
+_mock_llama.LogitsProcessorList = list
+_mock_llama.LlamaGrammar = MagicMock
+sys.modules.setdefault("llama_cpp.llama", _mock_llama)
+
+
+@dataclass
+class MockModelParams:
+    """Mimics the relevant fields of llama_model_params for testing."""
+    n_gpu_layers: int = 0
+    use_mmap: bool = True
+
+
+def _apply_mmap_logic(n_gpu_layers: int, use_mmap: bool, gpu_offload_supported: bool) -> bool:
+    """Replicate the mmap auto-disable logic from Llama.__init__."""
+    if n_gpu_layers == -1 and use_mmap and gpu_offload_supported:
+        return False
+    return use_mmap
+
+
+def test_mmap_disabled_when_all_layers_offloaded():
+    """When n_gpu_layers=-1 and GPU offload is supported, use_mmap should be set to False."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_kept_when_partial_offload():
+    """When n_gpu_layers is not -1, use_mmap should remain True."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_kept_when_no_gpu_support():
+    """When GPU offload is not supported, use_mmap should remain True even with n_gpu_layers=-1."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=False)
+    assert result is True
+
+
+def test_mmap_kept_when_zero_gpu_layers():
+    """When n_gpu_layers=0, use_mmap should remain True (CPU-only inference)."""
+    result = _apply_mmap_logic(n_gpu_layers=0, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_respects_explicit_false():
+    """When user explicitly sets use_mmap=False, it should stay False regardless."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=False, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_disabled_applies_to_params():
+    """Verify the logic correctly updates a MockModelParams object."""
+    params = MockModelParams(n_gpu_layers=-1, use_mmap=True)
+    params.use_mmap = _apply_mmap_logic(
+        n_gpu_layers=params.n_gpu_layers,
+        use_mmap=params.use_mmap,
+        gpu_offload_supported=True,
+    )
+    assert params.use_mmap is False