From f3f5c75e72f955425b2ab3b07e6088e4812a4168 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 10 Mar 2026 12:45:20 -0700 Subject: [PATCH 1/3] add kimi config --- .github/configs/nvidia-master.yaml | 25 ++++++++ benchmarks/single_node/kimik2.5_fp4_b200.sh | 65 +++++++++++++++++++++ perf-changelog.yaml | 6 ++ 3 files changed, 96 insertions(+) create mode 100644 benchmarks/single_node/kimik2.5_fp4_b200.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fc837704c..0cb724d98 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1873,6 +1873,31 @@ kimik2.5-int4-h200-vllm: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } +kimik2.5-fp4-b200-vllm: + image: vllm/vllm-openai:v0.17.0 + model: nvidia/Kimi-K2.5-NVFP4 + model-prefix: kimik2.5 + runner: b200 + precision: fp4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 128 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + dsr1-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.8-cu130-amd64 model: deepseek-ai/DeepSeek-R1-0528 diff --git a/benchmarks/single_node/kimik2.5_fp4_b200.sh b/benchmarks/single_node/kimik2.5_fp4_b200.sh new file mode 100644 index 000000000..21edfa480 --- /dev/null +++ b/benchmarks/single_node/kimik2.5_fp4_b200.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + MAX_MODEL_LEN \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +nvidia-smi + +export TORCH_CUDA_ARCH_LIST="10.0" +export PYTHONNOUSERSITE=1 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +--tensor-parallel-size=$TP \ +--gpu-memory-utilization 0.90 \ +--max-model-len $MAX_MODEL_LEN \ +--max-num-seqs $CONC \ +--reasoning-parser kimi_k2 \ +--tool-call-parser kimi_k2 \ +--compilation_config.pass_config.fuse_allreduce_rms true \ +--trust-remote-code > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +# Wait for server to be ready +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $(( CONC * 10 )) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ \ + --trust-remote-code + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x \ No newline at end of file diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c19ddbd1a..6fd6cd0cb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -932,3 +932,9 @@ - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 + - config-keys: + - kimik2.5-fp4-b200-vllm + description: + - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" + - "Image: vllm/vllm-openai:v0.17.0" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 From 1a5a85c6638e31bd42a7d30cc81c9951538b0a47 Mon Sep 17 00:00:00 2001 From: Ankur Singh Date: Tue, 10 Mar 2026 12:46:59 -0700 Subject: [PATCH 2/3] Update perf-changelog.yaml --- perf-changelog.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 6fd6cd0cb..ee0608bc1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -938,3 +938,4 @@ - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration" - "Image: vllm/vllm-openai:v0.17.0" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/862 + From d7947dcce59c05865613996a486c67f31ce5c909 Mon Sep 17 00:00:00 2001 From: ankursingh-nv Date: Tue, 10 Mar 2026 12:48:54 -0700 Subject: [PATCH 3/3] fix perf changelog --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ee0608bc1..f113fcbba 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -932,7 +932,7 @@ - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867 - - config-keys: +- config-keys: - kimik2.5-fp4-b200-vllm description: - "Add Kimi K2.5 FP4 B200 vLLM benchmark configuration"
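
Usage note for the new benchmark script (illustrative, not part of the patches above): benchmarks/single_node/kimik2.5_fp4_b200.sh takes all of its parameters from environment variables enforced by check_env_vars, and in CI those values are driven by the kimik2.5-fp4-b200-vllm search space in nvidia-master.yaml. A minimal standalone invocation might look like the sketch below; the concrete values, and the MAX_MODEL_LEN and RANDOM_RANGE_RATIO choices in particular, are assumptions for illustration rather than values taken from the patch.

    # Hypothetical manual run; in CI the harness exports these from the
    # kimik2.5-fp4-b200-vllm config (tp/conc points come from search-space).
    export MODEL=nvidia/Kimi-K2.5-NVFP4
    export TP=8                              # tensor-parallel size
    export CONC=64                           # one point in [conc-start, conc-end]
    export ISL=1024                          # input sequence length
    export OSL=1024                          # output sequence length
    export MAX_MODEL_LEN=$(( ISL + OSL ))    # assumed; the harness may choose differently
    export RANDOM_RANGE_RATIO=0.8            # illustrative value only
    export RESULT_FILENAME=kimik2.5_fp4_tp8_conc64_1024x1024
    export RUN_EVAL=false                    # skip the lm-eval pass after throughput
    bash benchmarks/single_node/kimik2.5_fp4_b200.sh

With these variables set, the script starts vllm serve with --max-num-seqs equal to CONC, blocks on wait_for_server_ready, then drives CONC * 10 prompts at a fixed concurrency of CONC through run_benchmark_serving, writing results to /workspace/$RESULT_FILENAME.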