# =============================================================================
# models.conf — Server configuration for llama.cpp Docker wrapper
#
# This file controls HOW THE SERVER STARTS: model file, context size, GPU
# layers, and llama-server flags. start.sh and benchmark.sh read this file
# and generate .env → docker-compose.yml → llama-server.
#
# Client-side settings (system prompts, reasoning levels, etc.) are NOT here.
# For benchmarks, see benchmarks/evalplus/bench-client.conf.
# For interactive use, set system prompts in your client (web UI, API calls).
#
# Required fields: MODEL
# Optional fields: CTX_SIZE, N_GPU_LAYERS, FIT, EXTRA_ARGS
# (docker-compose.yml defaults apply when omitted)
#
# GPU placement: --fit on (default) with --n-gpu-layers auto (default)
# handles GPU/CPU distribution automatically, including MoE expert offload.
# Do NOT use -ot for GPU device assignments — use --fit instead.
# See https://github.com/ggml-org/llama.cpp/issues/19816 for details.
#
# Section IDs are used as CLI shortcuts: ./start.sh glm-flash-q4
# =============================================================================
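The section-ID shortcut above implies start.sh resolves a `[section]` header to its KEY=VALUE pairs. The real parsing lives in start.sh and is not reproduced here; purely as an illustration of this file's format, a minimal reader might look like this (Python used for illustration only):

```python
# Minimal sketch of the [section] / KEY=VALUE format used by models.conf.
# Illustration only — the actual parser is start.sh, whose logic may differ.
def parse_models_conf(text):
    sections = {}
    current = None
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue                          # skip blanks and comments
        if line.startswith("[") and line.endswith("]"):
            current = line[1:-1]              # new section ID, e.g. glm-flash-q4
            sections[current] = {}
        elif "=" in line and current is not None:
            key, _, value = line.partition("=")
            sections[current][key] = value    # values may themselves contain '='
    return sections
```

Usage: `parse_models_conf(open("models.conf").read())["glm-flash-q4"]["MODEL"]`.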
# -----------------------------------------------------------------------------
# GLM-4.7 Flash — 30B-A3B MoE (64 experts, 4 active + 1 shared per layer)
#
# 47 layers (1 dense lead + 46 MoE). 128K native context window.
# MLA (Multi-head Latent Attention): compact KV cache (~3.5 GB at 128K).
# Q4 (18 GB) fits entirely on RTX 4090 even at 128K.
# Q8 (30 GB) needs both GPUs — FIT auto-distributes.
#
# Architecture source: model card (models/documentation/README_modelcard_GLM-4.7-Flash.md)
# -----------------------------------------------------------------------------
# Q4: fits on single RTX 4090. --split-mode none prevents unnecessary
# distribution to CUDA1. ~147 t/s.
# Note: --split-mode none in EXTRA_ARGS overrides the docker-compose default
# (--split-mode layer). --main-gpu 0 in EXTRA_ARGS is redundant with the
# docker-compose MAIN_GPU=0 default, but is kept here for clarity. Both
# docker-compose defaults are needed for other models that do NOT set
# --split-mode none, so they cannot be removed globally.
[glm-flash-q4]
NAME=GLM-4.7 Flash Q4_K_M
DESCRIPTION=Fast general tasks, reasoning, tool calling
SPEED=~147 t/s
MODEL=GLM-4.7-Flash/GLM-4.7-Flash-Q4_K_M.gguf
CTX_SIZE=131072
EXTRA_ARGS=--jinja -np 1 --temp 1.0 --top-p 0.95 --min-p 0.01 --split-mode none --main-gpu 0
# Q8: 30 GB, tight fit across both GPUs. With default FIT_TARGET (128,1024
# from docker-compose.yml, tuned for this hardware), FIT keeps everything
# on GPU without unnecessary expert offload.
# Previous: explicit -ot regex splits, FIT=off, N_GPU_LAYERS=99. ~105 t/s.
[glm-flash-q8]
NAME=GLM-4.7 Flash Q8_0
DESCRIPTION=Higher quality reasoning and tool calling
SPEED=~112 t/s
MODEL=GLM-4.7-Flash/GLM-4.7-Flash-Q8_0.gguf
CTX_SIZE=131072
EXTRA_ARGS=--jinja -np 1 --temp 1.0 --top-p 0.95 --min-p 0.01
# Q8 experimental: same as Q8, different model file.
[glm-flash-exp]
NAME=GLM-4.7 Flash Q8_0 (experimental)
DESCRIPTION=Experimental Q8 (same as glm-flash-q8)
SPEED=~112 t/s
MODEL=GLM-4.7-Flash/other/GLM-4.7-Flash-experimental.Q8_0.gguf
CTX_SIZE=131072
EXTRA_ARGS=--jinja -np 1 --temp 1.0 --top-p 0.95 --min-p 0.01
# =============================================================================
# RETIRED MODELS (2026-02-26)
#
# Replaced by Qwen3.5 family after benchmark comparison:
# - GPT-OSS 120B: outclassed by Qwen3.5-122B (94.5% vs 87.2% HE+, similar speed)
# - Qwen3-Coder-Next: matched by Qwen3.5-35B (90.9% HE+ each, 120 vs 33 t/s)
# - Qwen3-Next-80B: replaced by Qwen3.5-122B (94.5% vs 93.9% HE+)
#
# Commented out, not deleted — uncomment to reactivate if model files are present.
# Benchmark data and REPORT.md scores are preserved.
# =============================================================================
# # -----------------------------------------------------------------------------
# # GPT-OSS 120B F16 (MXFP4 native) — MoE with CPU expert offloading
# # 116.8B total, ~5.1B active. 36 layers. 128 experts, 4 active.
# # Reasoning levels via system prompt: "Reasoning: low/medium/high"
# # Previous: ~22 t/s. HumanEval+ 87.2%.
# # -----------------------------------------------------------------------------
#
# [gpt-oss-120b]
# NAME=GPT-OSS 120B F16
# DESCRIPTION=Deep reasoning, knowledge — set "Reasoning: low/high" in system prompt (medium is default)
# SPEED=~22 t/s
# MODEL=GPT-OSS-120b/gpt-oss-120b-F16.gguf
# CTX_SIZE=131072
# EXTRA_ARGS=--jinja -np 1 --temp 1.0 --top-p 1.0 --top-k 0
# # -----------------------------------------------------------------------------
# # Qwen3-Coder-Next — 80B MoE (512 experts, 10 active, ~3B active/token)
# # 48 layers. DeltaNet hybrid. UD-Q5 recommended (93.9% HE, 90.9% HE+).
# # Previous: ~33 t/s.
# # -----------------------------------------------------------------------------
#
# [qwen3-coder-ud-q5]
# NAME=Qwen3-Coder-Next UD-Q5_K_XL
# DESCRIPTION=Coding agents, agentic tasks
# SPEED=~33 t/s
# MODEL=Qwen3-Coder-Next/UD-Q5_K_XL/Qwen3-Coder-Next-UD-Q5_K_XL-00001-of-00003.gguf
# CTX_SIZE=262144
# EXTRA_ARGS=--jinja -np 1 --no-context-shift --temp 1.0 --top-p 0.95 --top-k 40 --min-p 0.01
# # -----------------------------------------------------------------------------
# # Qwen3-Next-80B-A3B — 80B MoE (512 experts, 10+1 shared, ~3B active/token)
# # Non-thinking model. DeltaNet hybrid. 98.2% HE, 93.9% HE+.
# # Previous: ~33 t/s, 55 graph splits.
# # -----------------------------------------------------------------------------
#
# [qwen3-next-ud-q5]
# NAME=Qwen3-Next-80B-A3B UD-Q5_K_XL
# DESCRIPTION=General-purpose reasoning, knowledge, agentic tasks, ultra-long context (262K)
# SPEED=~33 t/s
# MODEL=Qwen3-Next/UD-Q5_K_XL/Qwen3-Next-80B-A3B-Instruct-UD-Q5_K_XL-00001-of-00002.gguf
# CTX_SIZE=262144
# EXTRA_ARGS=--jinja -np 1 --no-context-shift --temp 0.7 --top-p 0.8 --top-k 20 --min-p 0
# -----------------------------------------------------------------------------
# Qwen3.5-35B-A3B — 35B MoE (256 experts, 8 routed + 1 shared, ~3B active/token)
#
# 40 layers. Hybrid DeltaNet architecture identical to Qwen3-Next family:
# 75% Gated DeltaNet (30/40 layers, linear attention, no KV cache) +
# 25% Gated Attention (10/40 layers, standard attention with KV cache).
# Layout: 10 x (3 x (Gated DeltaNet -> MoE) -> 1 x (Gated Attention -> MoE))
#
# Expert dimensions are small (intermediate=512, hidden=2048) — many experts
# but each is compact. 256 experts per layer, 8 routed + 1 shared active.
#
# File size: 29 GiB (~31.1 GB). Does NOT fit on single RTX 4090 (24 GB) →
# FIT auto distributes across CUDA0 + CUDA1 + CPU. Similar total size to
# GLM Q8 (30 GB) but different architecture (DeltaNet + more experts).
# Only 10/40 layers have KV cache, so 262K context is feasible.
#
# --no-context-shift required for DeltaNet (linear attention with recurrent
# state — context shifting would corrupt state).
#
# Tested 2026-02-26: ~120 t/s at 262K context, 3 graph splits.
# CUDA0 ~24 GB (97%), CUDA1 ~12 GB (75%). KV cache 2720 MiB (10 layers).
# Faster than GLM Q8 (112 t/s) due to fewer graph splits (3 vs 5).
#
# Qwen3.5 is a THINKING MODEL — generates <think> blocks by default.
# Does NOT support /think or /nothink soft switches (unlike Qwen3).
# Thinking mode disabled only via chat template parameter (enable_thinking=false).
# Sampler settings: thinking-general profile from model card.
# presence_penalty=1.5 recommended but must be set client-side (not a server flag).
#
# Architecture source: model card (models/documentation/README_Qwen3.5-35B-A3B-GGUF.md)
# -----------------------------------------------------------------------------
[qwen35-35b-q6]
NAME=Qwen3.5-35B-A3B UD-Q6_K_XL
DESCRIPTION=Thinking model — reasoning, coding, multilingual, agentic (DeltaNet MoE)
SPEED=~120 t/s
MODEL=Qwen3.5/MoE/35B/UD6_K_XL/Qwen3.5-35B-A3B-UD-Q6_K_XL.gguf
CTX_SIZE=262144
EXTRA_ARGS=--jinja -np 1 --no-context-shift --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0
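The "only 10/40 layers have KV cache" point above is what makes 262K context cheap here. As a back-of-envelope check, with hypothetical head dimensions (2 KV heads and head_dim 128 are ASSUMPTIONS for illustration, not taken from the model card) and fp16 K+V:

```python
# Back-of-envelope KV cache size for a hybrid-attention model where only
# some layers keep a KV cache. Head counts below are assumptions for
# illustration — consult the model card for the real values.
def kv_cache_bytes(attn_layers, ctx, n_kv_heads, head_dim, bytes_per_elem=2):
    # factor 2 for K and V; fp16 (2 bytes/element) by default
    return 2 * attn_layers * ctx * n_kv_heads * head_dim * bytes_per_elem

mib = kv_cache_bytes(attn_layers=10, ctx=262144, n_kv_heads=2, head_dim=128) / 2**20
print(f"{mib:.0f} MiB")  # → 2560 MiB, same ballpark as the 2720 MiB logged above
```

With all 40 layers attending (no DeltaNet), the same arithmetic would quadruple the figure.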
# -----------------------------------------------------------------------------
# Qwen3.5-122B-A10B — 122B MoE (256 experts, 8 routed + 1 shared, ~10B active/token)
#
# 48 layers. Hybrid DeltaNet architecture matching Qwen3.5-35B-A3B family:
# 75% Gated DeltaNet (36/48 layers, linear attention, no KV cache) +
# 25% Gated Attention (12/48 layers, standard attention with KV cache).
# Layout: 16 x (3 x (Gated DeltaNet -> MoE) -> 1 x (Gated Attention -> MoE))
#
# Expert dimensions: intermediate=1024, hidden=3072. 256 experts per layer,
# 8 routed + 1 shared active. 10B active per token — roughly 2x GPT-OSS (5.1B)
# and 3.3x Qwen3.5-35B (3B).
#
# File size: ~65 GiB (~69.8 GB) across 3 GGUF parts (part 1 is 11 MiB metadata).
# Far exceeds both GPUs combined (~36 GB usable) -> FIT auto with heavy CPU
# expert offload. Similar offload pattern to GPT-OSS 120B (61 GB) and
# Qwen3-Coder-Next (57 GB).
# Only 12/48 layers have KV cache, so 262K context is feasible.
#
# Tested 2026-02-26: ~18 t/s at 262K context, 65 graph splits (bs=1).
# CUDA0 ~21 GB model + 1 GB KV + 1 GB compute = ~24 GB (96%).
# CUDA1 ~8 GB model + 2 GB KV + 0.7 GB compute = ~14 GB (88%).
# CPU: ~64.5 GiB model buffers (expert offload). RAM tight — ~4 GB free, swap used.
# Container: 55.9 GiB / 62.7 GiB.
#
# --no-context-shift required for DeltaNet (linear attention with recurrent
# state — context shifting would corrupt state).
#
# Qwen3.5 is a THINKING MODEL — generates <think> blocks by default.
# Does NOT support /think or /nothink soft switches (unlike Qwen3).
# Thinking mode disabled only via chat template parameter (enable_thinking=false).
# Sampler settings: thinking-general profile from model card.
# presence_penalty=1.5 recommended but must be set client-side (not a server flag).
#
# Architecture source: model card (models/documentation/README_Qwen3.5-122B-A10B-GGUF.md)
# -----------------------------------------------------------------------------
[qwen35-122b-q4]
NAME=Qwen3.5-122B-A10B UD-Q4_K_XL
DESCRIPTION=Thinking model — deep reasoning, coding, multilingual, agentic (DeltaNet MoE, 10B active)
SPEED=~18 t/s
MODEL=Qwen3.5/MoE/122B/US_Q4_K_XL/Qwen3.5-122B-A10B-UD-Q4_K_XL-00001-of-00003.gguf
CTX_SIZE=262144
EXTRA_ARGS=--jinja -np 1 --no-context-shift --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0
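Since presence_penalty must be set client-side (per the notes above), it belongs in each request body, not in EXTRA_ARGS. A sketch of what a client payload could look like, using the OpenAI-compatible /v1/chat/completions schema that llama-server exposes (the model name and the exact request wrapper are placeholders):

```python
import json

# presence_penalty is sent per-request by the client — it is not a
# llama-server startup flag. Field names follow the OpenAI-compatible
# chat completions schema.
payload = {
    "model": "qwen35-122b-q4",              # typically ignored by a single-model server
    "messages": [{"role": "user", "content": "Hello"}],
    "presence_penalty": 1.5,                # model-card recommendation
}
body = json.dumps(payload)
# e.g.: requests.post("http://localhost:8080/v1/chat/completions", data=body)
```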
# -----------------------------------------------------------------------------
# Qwen3.5-27B — DENSE 27B model (NOT MoE — all 27B params active every token)
#
# 64 layers. Hybrid DeltaNet architecture matching the Qwen3.5 family:
# 75% Gated DeltaNet (48/64 layers, linear attention, no KV cache) +
# 25% Gated Attention (16/64 layers, standard attention with KV cache).
# Layout: 16 x (3 x (Gated DeltaNet -> FFN) -> 1 x (Gated Attention -> FFN))
#
# IMPORTANT: Despite the HuggingFace tag "qwen3_5_moe", this model uses FFN
# (intermediate=17408, hidden=4096) — NOT MoE. There are no experts, no router.
# All 27B parameters are active on every token. This makes it significantly more
# compute-intensive per token than MoE siblings (35B-A3B, 122B-A10B) that only
# activate ~3-10B per token.
#
# File size: 31 GiB (~33.3 GB), single GGUF. Does NOT fit on single RTX 4090
# (24 GB) -> FIT auto distributes across CUDA0 + CUDA1.
# With 262K context and KV cache (16 attention layers), some spillover to CPU
# is likely. Dense model means FIT will use simple layer-split distribution
# (no expert offload needed) — expect few graph splits (~2-3, similar to dense
# model behavior documented in gpu-strategy-guide.md).
#
# --no-context-shift required for DeltaNet (linear attention with recurrent
# state — context shifting would corrupt state).
#
# Qwen3.5 is a THINKING MODEL — generates <think> blocks by default.
# Does NOT support /think or /nothink soft switches (unlike Qwen3).
# Thinking mode disabled only via chat template parameter (enable_thinking=false).
# Sampler settings: thinking-general profile from model card.
# presence_penalty=1.5 recommended but must be set client-side (not a server flag).
#
# Architecture source: model card (models/documentation/CANDIDATES/README_Qwen3.5-27B-GGUF.md)
#
# Q8 (31 GiB): tested 2026-02-26. CUDA crash fixed by upstream update (PR #19866).
# Loads and runs but only ~6 t/s — 5.7 GB of model weights spill to CPU because
# KV cache (8.7 GB for 16 layers at 262K) fills VRAM. Dense = all 27B active/token,
# so CPU spill is devastating for speed.
#
# Q6 (22 GiB): tested 2026-02-26. ~31 t/s at 262K context. CUDA0 81%, CUDA1 90%.
# Only 1.3 GiB model on CPU (vs 5.7 GiB with Q8). 5x faster than Q8.
# Q8 kept on disk as fallback.
# -----------------------------------------------------------------------------
[qwen35-27b-q6]
NAME=Qwen3.5-27B UD-Q6_K_XL
DESCRIPTION=Thinking model — dense 27B reasoning, coding, multilingual, ultra-long context (DeltaNet hybrid)
SPEED=~31 t/s
MODEL=Qwen3.5/Dense/27B-UD-Q6_K_XL/Qwen3.5-27B-UD-Q6_K_XL.gguf
CTX_SIZE=262144
EXTRA_ARGS=--jinja -np 1 --no-context-shift --temp 1.0 --top-p 0.95 --top-k 20 --min-p 0
# =============================================================================
# Benchmark profiles (EvalPlus HumanEval+)
#
# IMPORTANT: These profiles were converted from explicit -ot splits to FIT auto
# on 2026-02-23. Previous benchmark results in REPORT.md were run with the old
# -ot configurations. A re-benchmark is needed to update the scores — results
# may differ due to different GPU placement and graph splits.
#
# Previous benchmark configs are preserved in comments for reference.
#
# Key settings for benchmarks:
# - CTX_SIZE=10240 (HumanEval worst case is ~8.4K tokens)
# - No sampler args — evalplus sends temperature=0 via API (greedy decoding)
# - --reasoning-format none for thinking models (GLM, Qwen3.5)
#
# Client-side config (system prompts) is in benchmarks/evalplus/bench-client.conf.
# Used by: benchmarks/evalplus/benchmark.sh
# =============================================================================
# --- GLM-4.7 Flash benchmarks ---
# Q4: fits on single RTX 4090.
# Previous: N_GPU_LAYERS=99, FIT=off, --split-mode none. ~140 t/s.
[bench-glm-flash-q4]
NAME=GLM-4.7 Flash Q4_K_M (benchmark)
MODEL=GLM-4.7-Flash/GLM-4.7-Flash-Q4_K_M.gguf
CTX_SIZE=10240
EXTRA_ARGS=--jinja -np 1 --reasoning-format none --split-mode none --main-gpu 0
# Q8: FIT auto-distributes across both GPUs.
# Previous: N_GPU_LAYERS=99, FIT=off, -ot blk.0-34=CUDA0,blk.35-46=CUDA1. ~105 t/s.
[bench-glm-flash-q8]
NAME=GLM-4.7 Flash Q8_0 (benchmark)
MODEL=GLM-4.7-Flash/GLM-4.7-Flash-Q8_0.gguf
CTX_SIZE=10240
EXTRA_ARGS=--jinja -np 1 --reasoning-format none
# --- Retired bench profiles (2026-02-26) — scores preserved in REPORT.md ---
#
# [bench-gpt-oss-120b]
# NAME=GPT-OSS 120B F16 (benchmark)
# MODEL=GPT-OSS-120b/gpt-oss-120b-F16.gguf
# CTX_SIZE=10240
# EXTRA_ARGS=--jinja -np 1
#
# [bench-qwen3-coder-ud-q5]
# NAME=Qwen3-Coder-Next UD-Q5_K_XL (benchmark)
# MODEL=Qwen3-Coder-Next/UD-Q5_K_XL/Qwen3-Coder-Next-UD-Q5_K_XL-00001-of-00003.gguf
# CTX_SIZE=10240
# EXTRA_ARGS=--jinja -np 1 --no-context-shift
#
# [bench-qwen3-next-ud-q5]
# NAME=Qwen3-Next-80B-A3B UD-Q5_K_XL (benchmark)
# MODEL=Qwen3-Next/UD-Q5_K_XL/Qwen3-Next-80B-A3B-Instruct-UD-Q5_K_XL-00001-of-00002.gguf
# CTX_SIZE=10240
# EXTRA_ARGS=--jinja -np 1 --no-context-shift
# --- Qwen3.5-35B-A3B benchmark ---
# Thinking model — --reasoning-format none forces think blocks into content field
# so postprocessor can strip them. --no-context-shift required for DeltaNet.
# At 10K context, 30 GB model likely fits entirely on both GPUs with FIT.
[bench-qwen35-35b-q6]
NAME=Qwen3.5-35B-A3B UD-Q6_K_XL (benchmark)
MODEL=Qwen3.5/MoE/35B/UD6_K_XL/Qwen3.5-35B-A3B-UD-Q6_K_XL.gguf
CTX_SIZE=10240
EXTRA_ARGS=--jinja -np 1 --reasoning-format none --no-context-shift
# --- Qwen3.5-122B-A10B benchmark ---
# Thinking model — --reasoning-format none forces think blocks into content field.
# --no-context-shift required for DeltaNet. FIT auto with CPU expert offload.
[bench-qwen35-122b-q4]
NAME=Qwen3.5-122B-A10B UD-Q4_K_XL (benchmark)
MODEL=Qwen3.5/MoE/122B/US_Q4_K_XL/Qwen3.5-122B-A10B-UD-Q4_K_XL-00001-of-00003.gguf
CTX_SIZE=10240
EXTRA_ARGS=--jinja -np 1 --reasoning-format none --no-context-shift
# --- Qwen3.5-27B benchmark (dense) ---
# Thinking model — --reasoning-format none forces think blocks into content field.
# --no-context-shift required for DeltaNet. Dense model, FIT across both GPUs.
# At 10K context, 32 GB model may fit entirely on both GPUs (no CPU spill).
[bench-qwen35-27b-q6]
NAME=Qwen3.5-27B UD-Q6_K_XL (benchmark)
MODEL=Qwen3.5/Dense/27B-UD-Q6_K_XL/Qwen3.5-27B-UD-Q6_K_XL.gguf
CTX_SIZE=10240
EXTRA_ARGS=--jinja -np 1 --reasoning-format none --no-context-shift
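With --reasoning-format none, the <think>…</think> blocks land inline in the content field, as noted above. The actual postprocessor in the benchmark pipeline is not shown in this file; a minimal regex-based strip could look like this (a sketch, not the pipeline's real implementation):

```python
import re

# Remove inline <think>...</think> blocks that --reasoning-format none
# leaves in the content field. Minimal sketch — the real postprocessor
# used by benchmark.sh may differ.
def strip_think(content):
    return re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
```

The DOTALL flag matters: thinking blocks usually span multiple lines.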
# =============================================================================
# Issue #19816 test profiles (kept for reference)
# See https://github.com/ggml-org/llama.cpp/issues/19816
#
# These tests led to the discovery that N_GPU_LAYERS=99 (hardcoded in Dockerfile
# and docker-compose.yml) prevented FIT from working correctly. Changing the
# default to 'auto' resolved the issue — FIT now handles GPU/CPU distribution
# automatically, including MoE expert offload.
#
# ggerganov suggested -ts instead of -ot for GPU splits.
# Result: works but GPU barely used (~960 MiB CUDA0, ~647 MiB CUDA1),
# ~19.8 t/s at 262K. -ts only distributes non-expert data after
# -ot exps=CPU moves 47 GB to CPU.
#[test-ts-split]
#NAME=TEST: -ts 21,10 + exps=CPU (issue #19816)
#MODEL=Qwen3-Next/UD-Q5_K_XL/Qwen3-Next-80B-A3B-Instruct-UD-Q5_K_XL-00001-of-00002.gguf
#CTX_SIZE=262144
#N_GPU_LAYERS=99
#FIT=off
#EXTRA_ARGS=--jinja -np 1 -b 2048 -ub 512 --no-context-shift --tensor-split 21,10 -ot exps=CPU
#
# pwilkin suggested --fit without -ot exps=CPU.
# OOM with n_gpu_layers=99. With auto: FIT handled everything, 32.9 t/s.
# This became the new default approach for all profiles.
#[test-fit-only]
#NAME=TEST: FIT only, no -ot (issue #19816)
#MODEL=Qwen3-Next/UD-Q5_K_XL/Qwen3-Next-80B-A3B-Instruct-UD-Q5_K_XL-00001-of-00002.gguf
#CTX_SIZE=262144
#FIT=on
#EXTRA_ARGS=--jinja -np 1 --no-context-shift