diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e87246b59..17614991d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -413,7 +413,7 @@ minimaxm2.5-fp8-mi325x-vllm:
     - { tp: 4, conc-start: 4, conc-end: 64 }
 
 gptoss-fp4-mi300x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
+  image: vllm/vllm-openai-rocm:v0.17.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi300x
@@ -444,7 +444,7 @@ gptoss-fp4-mi300x-vllm:
     - { tp: 8, conc-start: 4, conc-end: 16 }
 
 gptoss-fp4-mi325x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
+  image: vllm/vllm-openai-rocm:v0.17.0
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: mi325x
@@ -475,8 +475,8 @@ gptoss-fp4-mi325x-vllm:
     - { tp: 8, conc-start: 4, conc-end: 16 }
 
 gptoss-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.16.0
-  model: openai/gpt-oss-120b
+  image: vllm/vllm-openai-rocm:v0.17.0
+  model: amd/gpt-oss-120b-w-mxfp4-a-fp8
   model-prefix: gptoss
   runner: mi355x
   precision: fp4
diff --git a/benchmarks/single_node/gptoss_fp4_mi300x.sh b/benchmarks/single_node/gptoss_fp4_mi300x.sh
index 7b64418e7..14eef7d5c 100644
--- a/benchmarks/single_node/gptoss_fp4_mi300x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi300x.sh
@@ -33,23 +33,23 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
-export VLLM_ROCM_USE_AITER_MHA=0
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
+FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 set -x
 vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
---block-size=64 \
---no-enable-prefix-caching \
---disable-log-requests > $SERVER_LOG 2>&1 &
+    $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
+    --tensor-parallel-size=$TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --block-size=64 \
+    --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/benchmarks/single_node/gptoss_fp4_mi325x.sh b/benchmarks/single_node/gptoss_fp4_mi325x.sh
index c8edf0c15..14eef7d5c 100644
--- a/benchmarks/single_node/gptoss_fp4_mi325x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi325x.sh
@@ -33,22 +33,23 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
     export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
-export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
+FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 set -x
 vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
---block-size=64 \
---no-enable-prefix-caching \
---disable-log-requests > $SERVER_LOG 2>&1 &
+    $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
+    --tensor-parallel-size=$TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --block-size=64 \
+    --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
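Note on the new flags in the serve scripts above and below: $ATTN_BACKEND and $FUSE_ROPE_KVCACHE are expanded unquoted, so each space-separated token becomes its own argument to vllm serve. The dotted -cc. keys appear to be vLLM's shorthand for nested --compilation-config fields; assuming that equivalence (it is not confirmed by this diff), the JSON spelling would be:

# Hedged sketch: JSON spelling of the dotted -cc. flags, assuming vLLM's
# --compilation-config (the form the old scripts used for cudagraph_mode)
# accepts the same nested keys. Illustrative only, not taken from this PR.
vllm serve $MODEL --port $PORT \
    --attention-backend ROCM_AITER_UNIFIED_ATTN \
    --compilation-config '{"pass_config": {"fuse_rope_kvcache": true}, "use_inductor_graph_partition": true}' \
    --tensor-parallel-size=$TP
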
diff --git a/benchmarks/single_node/gptoss_fp4_mi355x.sh b/benchmarks/single_node/gptoss_fp4_mi355x.sh
index 2012db23d..b7c9c3ddb 100644
--- a/benchmarks/single_node/gptoss_fp4_mi355x.sh
+++ b/benchmarks/single_node/gptoss_fp4_mi355x.sh
@@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
    export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
 fi
 
+export AMDGCN_USE_BUFFER_OPS=0
 export VLLM_ROCM_USE_AITER=1
-export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
-export VLLM_ROCM_USE_AITER_MHA=0
+export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
+FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 set -x
 vllm serve $MODEL --port $PORT \
---tensor-parallel-size=$TP \
---gpu-memory-utilization 0.95 \
---max-model-len $MAX_MODEL_LEN \
---compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
---block-size=64 \
---no-enable-prefix-caching \
---disable-log-requests > $SERVER_LOG 2>&1 &
+    $ATTN_BACKEND $FUSE_ROPE_KVCACHE \
+    --tensor-parallel-size=$TP \
+    --gpu-memory-utilization 0.95 \
+    --max-model-len $MAX_MODEL_LEN \
+    --block-size=64 \
+    --no-enable-prefix-caching > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1106e7b6c..e6a85927f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -911,3 +911,17 @@
     - "Redo qwen eval"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892
   evals-only: true
+
+- config-keys:
+    - gptoss-fp4-mi300x-vllm
+    - gptoss-fp4-mi325x-vllm
+    - gptoss-fp4-mi355x-vllm
+  description:
+    - "Update AMD GPT-OSS vLLM image from v0.16.0 to v0.17.0 for MI300X, MI325X, and MI355X"
+    - "MI355X: Switch model to amd/gpt-oss-120b-w-mxfp4-a-fp8 (MXFP4 weights + FP8 activations)"
+    - "MI355X: Add VLLM_ROCM_USE_AITER_TRITON_ROPE=1 for AITER triton RoPE kernel"
+    - "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars"
+    - "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass"
+    - "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867
+
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index f4f1e561f..fc04f5bb3 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -156,6 +156,7 @@ else
     PARTITION="compute"
 
     SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    LOCK_FILE="${SQUASH_FILE}.lock"
 
     set -x
     salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
@@ -163,16 +164,22 @@
 
     srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"
 
-    if [[ "$FRAMEWORK" == "atom" ]]; then
-        srun --jobid=$JOB_ID bash -c "rm $SQUASH_FILE"
-    fi
+    # Use flock to serialize concurrent imports to the same squash file
+    srun --jobid=$JOB_ID bash -c "
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if [[ \"$FRAMEWORK\" == \"atom\" ]]; then
+            rm -f \"$SQUASH_FILE\"
+        fi
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
 
-    srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
-    if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then
-        echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..."
-        srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE"
-        srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
-    fi
+    export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \