diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 71d94ebd8..c50ffba1c 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -242,6 +242,23 @@ def __init__(
         )  # keep a reference to the array so it is not gc'd
         self.model_params.tensor_split = self._c_tensor_split
         self.model_params.vocab_only = vocab_only
+
+        # When all layers are offloaded to GPU (n_gpu_layers == -1), disable mmap
+        # to prevent the memory-mapped model file from staying resident in RAM.
+        # With mmap enabled, the entire model file remains in the page cache even
+        # after weights are copied to VRAM. Disabling mmap causes llama.cpp to use
+        # a temporary read buffer that is freed after GPU upload.
+        # NOTE: use_mmap defaults to True, so this guard cannot tell an explicit
+        # use_mmap=True apart from the default; the message must not promise an
+        # override. See: https://github.com/abetlen/llama-cpp-python/issues/1964
+        if n_gpu_layers == -1 and use_mmap and llama_cpp.llama_supports_gpu_offload():
+            if self.verbose:
+                print(
+                    "Automatically disabling mmap because all layers are offloaded "
+                    "to GPU (n_gpu_layers=-1). This reduces host RAM usage. "
+                    "Pass a non-negative n_gpu_layers to keep mmap enabled.",
+                    file=sys.stderr,
+                )
+            use_mmap = False
 
         self.model_params.use_mmap = use_mmap if lora_path is None else False
         self.model_params.use_mlock = use_mlock
 
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index 13c951241..f6dc85786 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -48,7 +48,8 @@ class ModelSettings(BaseSettings):
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_supports_mmap(),
-        description="Use mmap.",
+        description="Use mmap. Automatically disabled to reduce host RAM usage "
+        "when n_gpu_layers is -1 (full GPU offload).",
     )
     use_mlock: bool = Field(
         default=llama_cpp.llama_supports_mlock(),
diff --git a/tests/test_mmap_gpu_offload.py b/tests/test_mmap_gpu_offload.py
new file mode 100644
index 000000000..fa725771c
--- /dev/null
+++ b/tests/test_mmap_gpu_offload.py
@@ -0,0 +1,75 @@
+"""Tests for automatic mmap disabling when all layers are offloaded to GPU.
+
+See: https://github.com/abetlen/llama-cpp-python/issues/1964
+"""
+
+import sys
+from unittest.mock import MagicMock
+from dataclasses import dataclass
+
+# Stub the native C library so tests can run without compiling llama.cpp
+_mock_llama_cpp = MagicMock()
+_mock_llama_cpp.llama_log_callback = lambda f: f
+_mock_llama_cpp.llama_log_set = MagicMock()
+sys.modules.setdefault("llama_cpp.llama_cpp", _mock_llama_cpp)
+
+_mock_llama = MagicMock()
+_mock_llama.StoppingCriteriaList = list
+_mock_llama.LogitsProcessorList = list
+_mock_llama.LlamaGrammar = MagicMock
+sys.modules.setdefault("llama_cpp.llama", _mock_llama)
+
+
+@dataclass
+class MockModelParams:
+    """Mimics the relevant fields of llama_model_params for testing."""
+    n_gpu_layers: int = 0
+    use_mmap: bool = True
+
+
+def _apply_mmap_logic(n_gpu_layers: int, use_mmap: bool, gpu_offload_supported: bool) -> bool:
+    """Replicate the mmap auto-disable logic from Llama.__init__."""
+    if n_gpu_layers == -1 and use_mmap and gpu_offload_supported:
+        return False
+    return use_mmap
+
+
+def test_mmap_disabled_when_all_layers_offloaded():
+    """When n_gpu_layers=-1 and GPU offload is supported, use_mmap should be set to False."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_kept_when_partial_offload():
+    """When n_gpu_layers is not -1, use_mmap should remain True."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_kept_when_no_gpu_support():
+    """When GPU offload is not supported, use_mmap should remain True even with n_gpu_layers=-1."""
+    result = _apply_mmap_logic(n_gpu_layers=-1, use_mmap=True, gpu_offload_supported=False)
+    assert result is True
+
+
+def test_mmap_kept_when_zero_gpu_layers():
+    """When n_gpu_layers=0, use_mmap should remain True (CPU-only inference)."""
+    result = _apply_mmap_logic(n_gpu_layers=0, use_mmap=True, gpu_offload_supported=True)
+    assert result is True
+
+
+def test_mmap_respects_explicit_false():
+    """When user explicitly sets use_mmap=False, it should stay False regardless."""
+    result = _apply_mmap_logic(n_gpu_layers=10, use_mmap=False, gpu_offload_supported=True)
+    assert result is False
+
+
+def test_mmap_disabled_applies_to_params():
+    """Verify the logic correctly updates a MockModelParams object."""
+    params = MockModelParams(n_gpu_layers=-1, use_mmap=True)
+    params.use_mmap = _apply_mmap_logic(
+        n_gpu_layers=params.n_gpu_layers,
+        use_mmap=params.use_mmap,
+        gpu_offload_supported=True,
+    )
+    assert params.use_mmap is False