# =============================================================================
# llama.cpp server -- RTX 4090 + RTX 5070 Ti
#
# HOW TO USE:
#
#   1. Run the model selector (recommended):
#        ./start.sh                   # Interactive menu + monitoring dashboard
#        ./start.sh qwen3-coder       # Direct launch by model ID
#        ./start.sh --no-dashboard    # Launch without dashboard (raw logs)
#
#      start.sh reads models.conf, generates .env, starts the container, and
#      opens a curses dashboard with server logs, GPU/system monitoring, and
#      keyboard controls ([q] stop & exit, [r] stop & return to menu).
#
#   2. Or manually:
#        Edit .env (or copy from .env.example) and run: docker compose up
#
#   3. Access:
#        Web UI:  http://localhost:8080
#        API:     http://localhost:8080/v1/chat/completions
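#
#      Example request (a sketch only -- the prompt below is a placeholder; the
#      endpoint is OpenAI-compatible, so any OpenAI-style client also works):
#        curl http://localhost:8080/v1/chat/completions \
#          -H "Content-Type: application/json" \
#          -d '{"messages": [{"role": "user", "content": "Hello"}]}'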
#
# MODELS:
# All models are defined in models.conf. Run ./start.sh --list to see them.
#
# VARIABLE REFERENCE:
#
#   MODEL              Path to .gguf file relative to models/ dir
#   CTX_SIZE           Context window size in tokens
#   N_GPU_LAYERS       Number of layers to offload to GPU (99 = all)
#   SPLIT_MODE         Multi-GPU split: layer (default), row, none
#   TENSOR_SPLIT       Manual split ratio (blank = auto)
#   MAIN_GPU           Primary GPU index (0 = RTX 4090)
#   FLASH_ATTN         Flash attention (1 = on)
#   KV_CACHE_TYPE_K    KV cache key type (q8_0 recommended)
#   KV_CACHE_TYPE_V    KV cache value type (q8_0 recommended)
#   FIT                Auto-fit VRAM (on/off, use off with -ot)
#   FIT_TARGET         VRAM headroom in MiB per device
#   FIT_CTX            Minimum context size for auto-fit
#   EXTRA_ARGS         Any additional llama-server flags
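#
#   Example .env (illustrative only -- the values below are placeholders, not
#   recommendations; start.sh normally generates this file from models.conf):
#     MODEL=model.gguf
#     CTX_SIZE=32768
#     N_GPU_LAYERS=99
#     FLASH_ATTN=1
#     KV_CACHE_TYPE_K=q8_0
#     KV_CACHE_TYPE_V=q8_0
#     FIT=on
#     FIT_TARGET=128,1024
#   Any variable left unset falls back to the ${VAR:-default} expansions in the
#   environment section below.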
#
# =============================================================================
services:
  llama-server:
    build: .
    container_name: llama-server
    ports:
      - "8080:8080"
    volumes:
      - ./models:/models:ro
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    environment:
      # ---- Model ----
      - MODEL=/models/${MODEL:-model.gguf}
      # ---- Context ----
      - CTX_SIZE=${CTX_SIZE:-131072}
      # ---- GPU offload ----
      - N_GPU_LAYERS=${N_GPU_LAYERS:-99}
      # ---- Split mode: layer (default), row, none ----
      - SPLIT_MODE=${SPLIT_MODE:-layer}
      # ---- Manual tensor split ratio (blank = use --fit auto) ----
      - TENSOR_SPLIT=${TENSOR_SPLIT:-}
      # ---- Main GPU (0 = RTX 4090, 1 = RTX 5070 Ti) ----
      - MAIN_GPU=${MAIN_GPU:-0}
      # ---- Flash attention ----
      - FLASH_ATTN=${FLASH_ATTN:-1}
      # ---- KV cache quantization ----
      - KV_CACHE_TYPE_K=${KV_CACHE_TYPE_K:-q8_0}
      - KV_CACHE_TYPE_V=${KV_CACHE_TYPE_V:-q8_0}
      # ---- Auto-fit (on by default, adjusts layers/ctx to fit VRAM) ----
      - FIT=${FIT:-on}
      # ---- Fit target: VRAM headroom in MiB per device ----
      #   A single value applies to all devices equally (e.g. 1024).
      #   Comma-separated values give per-device control (e.g. 128,1024 for 2 GPUs).
      #   For asymmetric setups: use a small value for dedicated GPUs and a larger
      #   value for GPUs that share VRAM with the OS/display stack.
      #   Example: 128,1024 = 128 MiB for dedicated CUDA0, 1024 MiB for CUDA1 (display)
      - FIT_TARGET=${FIT_TARGET:-128,1024}
      # ---- Fit minimum context size ----
      - FIT_CTX=${FIT_CTX:-}
      # ---- Extra flags (pass anything not covered above) ----
      - EXTRA_ARGS=${EXTRA_ARGS:-}
    healthcheck:
      test: ["CMD", "curl", "-sf", "http://localhost:8080/health"]
      interval: 10s
      timeout: 5s
      retries: 60
      start_period: 120s
    restart: unless-stopped
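
# Once the container is up, it can be checked with standard Docker/curl commands,
# for example:
#   docker compose ps                        # status should report "healthy"
#   curl -sf http://localhost:8080/health    # same probe the healthcheck runs
#   docker compose logs -f llama-server      # follow server logs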