From ff1143c04accb59d081d26378b3db1e9e11b8464 Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Sun, 1 Mar 2026 04:13:16 +0000
Subject: [PATCH 1/4] [AMD] Add MiniMax M2.1 MXFP4 benchmark for MI355x vLLM (TP=2,4)

Add MiniMax M2.1 MXFP4 benchmark config for MI355x with vLLM v0.16.0.

- Model: amd/MiniMax-M2.1-MXFP4
- TP=2 and TP=4 (matching MiniMax M2.5 FP8 pattern)
- Only VLLM_ROCM_USE_AITER=1 env var (per Andy Luo recipe)
- Seq lengths: 1k1k, 1k8k, 8k1k (conc 4-64)

Closes #826

Co-authored-by: functionstackx
---
 .github/configs/amd-master.yaml              | 25 ++++
 .../single_node/minimaxm2.1_fp4_mi355x.sh    | 63 +++++++++++++++++++
 perf-changelog.yaml                          | 10 +++
 3 files changed, 98 insertions(+)
 create mode 100755 benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e688f6b91..77bc28b07 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -229,6 +229,31 @@ minimaxm2.5-fp8-mi355x-vllm:
         - { tp: 2, conc-start: 4, conc-end: 64 }
         - { tp: 4, conc-start: 4, conc-end: 64 }
 
+minimaxm2.1-fp4-mi355x-vllm:
+  image: vllm/vllm-openai-rocm:v0.16.0
+  model: amd/MiniMax-M2.1-MXFP4
+  model-prefix: minimaxm2.1
+  runner: mi355x
+  precision: fp4
+  framework: vllm
+  multinode: false
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 2, conc-start: 4, conc-end: 64 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
+    - isl: 1024
+      osl: 8192
+      search-space:
+        - { tp: 2, conc-start: 4, conc-end: 64 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 2, conc-start: 4, conc-end: 64 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
+
 gptoss-fp4-mi300x-vllm:
   image: vllm/vllm-openai-rocm:v0.16.0
   model: openai/gpt-oss-120b
diff --git a/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh b/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh
new file mode 100755
index 000000000..b6ea5f65d
--- /dev/null
+++ b/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    MAX_MODEL_LEN \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+
+if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
+    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
+fi
+
+export VLLM_ROCM_USE_AITER=1
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+set -x
+vllm serve $MODEL --port $PORT \
+--tensor-parallel-size=$TP \
+--gpu-memory-utilization 0.95 \
+--max-model-len $MAX_MODEL_LEN \
+--block-size=32 \
+--disable-log-requests \
+--trust-remote-code > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+# Wait for server to be ready
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/ \
+    --trust-remote-code
+
+# After throughput, run evaluation only if RUN_EVAL is true
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC
+    append_lm_eval_summary
+fi
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 30782b124..d9ca9d552 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -754,3 +754,13 @@
     - "Update SGLang image from v0.5.8 to v0.5.9 for AMD single-node DeepSeek R1 configs"
     - "Key changes: AITER v0.1.10.post3 with FP8 Prefill/Decode/KV Cache, FP8 prefill attention kernel, MORI EP two-batch overlapping, OOM fix for DeepSeek weight loading"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/816
+
+- config-keys:
+    - minimaxm2.1-fp4-mi355x-vllm
+  description:
+    - "Add MiniMax M2.1 MXFP4 vLLM benchmark for MI355X"
+    - "Model: amd/MiniMax-M2.1-MXFP4 with --trust-remote-code and --block-size=32"
+    - "Image: vllm/vllm-openai-rocm:v0.16.0"
+    - "Environment: VLLM_ROCM_USE_AITER=1"
+    - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX

From 62c090718b18ad4c2c22d635de423a1c1fcfe00a Mon Sep 17 00:00:00 2001
From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com>
Date: Sun, 1 Mar 2026 04:44:38 +0000
Subject: [PATCH 2/4] Disable TP=4 for MiniMax M2.1 MXFP4 due to vLLM bug

TP=4 is currently broken for this model (see vllm-project/vllm#35637).
Comment out the TP=4 search-space entries, keeping only TP=2.
Co-authored-by: functionstackx
---
 .github/configs/amd-master.yaml | 9 ++++++---
 perf-changelog.yaml             | 5 +++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 77bc28b07..442ff2ebc 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -242,17 +242,20 @@ minimaxm2.1-fp4-mi355x-vllm:
       osl: 1024
       search-space:
         - { tp: 2, conc-start: 4, conc-end: 64 }
-        - { tp: 4, conc-start: 4, conc-end: 64 }
+        # TP=4 disabled due to vLLM bug: https://github.com/vllm-project/vllm/issues/35637
+        # - { tp: 4, conc-start: 4, conc-end: 64 }
     - isl: 1024
       osl: 8192
       search-space:
         - { tp: 2, conc-start: 4, conc-end: 64 }
-        - { tp: 4, conc-start: 4, conc-end: 64 }
+        # TP=4 disabled due to vLLM bug: https://github.com/vllm-project/vllm/issues/35637
+        # - { tp: 4, conc-start: 4, conc-end: 64 }
     - isl: 8192
       osl: 1024
       search-space:
         - { tp: 2, conc-start: 4, conc-end: 64 }
-        - { tp: 4, conc-start: 4, conc-end: 64 }
+        # TP=4 disabled due to vLLM bug: https://github.com/vllm-project/vllm/issues/35637
+        # - { tp: 4, conc-start: 4, conc-end: 64 }
 
 gptoss-fp4-mi300x-vllm:
   image: vllm/vllm-openai-rocm:v0.16.0
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index d9ca9d552..02f937aec 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -762,5 +762,6 @@
     - "Model: amd/MiniMax-M2.1-MXFP4 with --trust-remote-code and --block-size=32"
     - "Image: vllm/vllm-openai-rocm:v0.16.0"
     - "Environment: VLLM_ROCM_USE_AITER=1"
-    - "TP=2 and TP=4, concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX
+    - "TP=2 only (TP=4 disabled due to vLLM bug https://github.com/vllm-project/vllm/issues/35637)"
+    - "Concurrency 4-64 for 1k1k, 1k8k, and 8k1k sequence lengths"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/827

From ebf8a84f25a3239898cacfd0a11daa972617488b Mon Sep 17 00:00:00 2001
From: Chun Fang
Date: Tue, 3 Mar 2026 14:15:38 +0000
Subject: [PATCH 3/4] Fix Issue #35637

Author: hongxiayang

- Keep AITER for attention but disable it specifically for MoE, so the
  fused MoE falls back to Triton kernels that can handle N=384 when TP=4
  and N=192 when TP=8.
- Install the amd-quark library to fix the crash when TP=4 with
  VLLM_ROCM_USE_AITER_MOE=0.
---
 benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh b/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh
index b6ea5f65d..bdd213ad0 100755
--- a/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh
@@ -24,6 +24,14 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+if [ "$TP" -ge 4 ]; then
+    # AITER CK fused MoE kernels lack compiled tiles for N=intermediate_size/TP
+    # when TP>=4 (TP=4, N=384). Disable AITER MoE to fall back to triton, but keep
+    # AITER attention. See: https://github.com/vllm-project/vllm/issues/35637
+    export VLLM_ROCM_USE_AITER_MOE=0
+    export VLLM_ATTENTION_BACKEND="ROCM_AITER_UNIFIED_ATTN"
+    pip install amd-quark 2>/dev/null || true
+fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}

From ba52e2ce0b286148a10d3aa5150823ca437f7e33 Mon Sep 17 00:00:00 2001
From: Chun Fang
Date: Tue, 3 Mar 2026 14:47:52 +0000
Subject: [PATCH 4/4] Activate TP=4 for MiniMax M2.1 FP4 MI355X vLLM

TP=4 works again now that the previous commit disables AITER MoE (while
keeping AITER attention) for TP>=4.
---
 .github/configs/amd-master.yaml | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a6a82ff25..5d3d35a1c 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -327,20 +327,17 @@ minimaxm2.1-fp4-mi355x-vllm:
       osl: 1024
       search-space:
         - { tp: 2, conc-start: 4, conc-end: 64 }
-        # TP=4 disabled due to vLLM bug: https://github.com/vllm-project/vllm/issues/35637
-        # - { tp: 4, conc-start: 4, conc-end: 64 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
     - isl: 1024
       osl: 8192
       search-space:
         - { tp: 2, conc-start: 4, conc-end: 64 }
-        # TP=4 disabled due to vLLM bug: https://github.com/vllm-project/vllm/issues/35637
-        # - { tp: 4, conc-start: 4, conc-end: 64 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
    - isl: 8192
       osl: 1024
       search-space:
         - { tp: 2, conc-start: 4, conc-end: 64 }
-        # TP=4 disabled due to vLLM bug: https://github.com/vllm-project/vllm/issues/35637
-        # - { tp: 4, conc-start: 4, conc-end: 64 }
+        - { tp: 4, conc-start: 4, conc-end: 64 }
 
 minimaxm2.5-fp8-mi355x-vllm:
   image: vllm/vllm-openai-rocm:v0.15.1
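
Usage note: the benchmark script added in PATCH 1/4 reads all of its parameters from environment variables validated by check_env_vars; in the CI sweep these are presumably supplied by the harness from the amd-master.yaml entry above. A minimal standalone invocation might look like the sketch below; the concrete values (concurrency, max model length, range ratio, result filename) are illustrative assumptions, not values taken from these patches.

    # Hypothetical manual run of one TP=2 / 1k1k sweep point; values are examples only.
    MODEL=amd/MiniMax-M2.1-MXFP4 \
    TP=2 \
    CONC=16 \
    ISL=1024 \
    OSL=1024 \
    MAX_MODEL_LEN=4096 \
    RANDOM_RANGE_RATIO=1.0 \
    RESULT_FILENAME=minimaxm2.1_fp4_tp2_1k1k_conc16.json \
    RUN_EVAL=false \
    bash benchmarks/single_node/minimaxm2.1_fp4_mi355x.sh

With TP=4, the script additionally exports VLLM_ROCM_USE_AITER_MOE=0 and VLLM_ATTENTION_BACKEND="ROCM_AITER_UNIFIED_ATTN" and installs amd-quark before starting the server, per the workaround in PATCH 3/4.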