Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ minimaxm2.5-fp8-mi325x-vllm:
- { tp: 4, conc-start: 4, conc-end: 64 }

gptoss-fp4-mi300x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
image: vllm/vllm-openai-rocm:v0.17.0
model: openai/gpt-oss-120b
model-prefix: gptoss
runner: mi300x
Expand Down Expand Up @@ -444,7 +444,7 @@ gptoss-fp4-mi300x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi325x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
image: vllm/vllm-openai-rocm:v0.17.0
model: openai/gpt-oss-120b
model-prefix: gptoss
runner: mi325x
Expand Down Expand Up @@ -475,8 +475,8 @@ gptoss-fp4-mi325x-vllm:
- { tp: 8, conc-start: 4, conc-end: 16 }

gptoss-fp4-mi355x-vllm:
image: vllm/vllm-openai-rocm:v0.16.0
model: openai/gpt-oss-120b
image: vllm/vllm-openai-rocm:v0.17.0
model: amd/gpt-oss-120b-w-mxfp4-a-fp8
model-prefix: gptoss
runner: mi355x
precision: fp4
Expand Down
18 changes: 9 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi300x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,23 +33,23 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
19 changes: 10 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi325x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,23 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
20 changes: 11 additions & 9 deletions benchmarks/single_node/gptoss_fp4_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,22 +33,24 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES"
fi

export AMDGCN_USE_BUFFER_OPS=0
export VLLM_ROCM_USE_AITER=1
export VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION=1
export VLLM_ROCM_USE_AITER_MHA=0
export VLLM_ROCM_USE_AITER_TRITON_ROPE=1
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
ATTN_BACKEND="--attention-backend ROCM_AITER_UNIFIED_ATTN"
FUSE_ROPE_KVCACHE="-cc.pass_config.fuse_rope_kvcache=True -cc.use_inductor_graph_partition=True"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

set -x
vllm serve $MODEL --port $PORT \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \
--block-size=64 \
--no-enable-prefix-caching \
--disable-log-requests > $SERVER_LOG 2>&1 &
$ATTN_BACKEND $FUSE_ROPE_KVCACHE \
--tensor-parallel-size=$TP \
--gpu-memory-utilization 0.95 \
--max-model-len $MAX_MODEL_LEN \
--block-size=64 \
--no-enable-prefix-caching > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
14 changes: 14 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -911,3 +911,17 @@
- "Redo qwen eval"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/892
evals-only: true

- config-keys:
- gptoss-fp4-mi300x-vllm
- gptoss-fp4-mi325x-vllm
- gptoss-fp4-mi355x-vllm
description:
- "Update AMD GPT-OSS vLLM image from v0.16.0 to v0.17.0 for MI300X, MI325X, and MI355X"
- "MI355X: Switch model to amd/gpt-oss-120b-w-mxfp4-a-fp8 (MXFP4 weights + FP8 activations)"
- "MI355X: Add VLLM_ROCM_USE_AITER_TRITON_ROPE=1 for AITER triton RoPE kernel"
- "Add AMDGCN_USE_BUFFER_OPS=0 and VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 env vars"
- "Switch to --attention-backend ROCM_AITER_UNIFIED_ATTN and add fuse_rope_kvcache compilation pass"
- "Remove deprecated VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION/VLLM_ROCM_USE_AITER_MHA env vars and compilation-config cudagraph_mode"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/867

25 changes: 16 additions & 9 deletions runners/launch_mi355x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -156,23 +156,30 @@ else

PARTITION="compute"
SQUASH_FILE="/var/lib/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

set -x
salloc --partition=$PARTITION --gres=gpu:$TP --cpus-per-task=128 --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)

srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"

if [[ "$FRAMEWORK" == "atom" ]]; then
srun --jobid=$JOB_ID bash -c "rm $SQUASH_FILE"
fi
# Use flock to serialize concurrent imports to the same squash file
srun --jobid=$JOB_ID bash -c "
exec 9>\"$LOCK_FILE\"
flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
if [[ \"$FRAMEWORK\" == \"atom\" ]]; then
rm -f \"$SQUASH_FILE\"
fi
if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
echo 'Squash file already exists and is valid, skipping import'
else
rm -f \"$SQUASH_FILE\"
enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
fi
"

srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
if ! srun --jobid=$JOB_ID bash -c "unsquashfs -l $SQUASH_FILE > /dev/null"; then
echo "unsquashfs failed, removing $SQUASH_FILE and re-importing..."
srun --jobid=$JOB_ID bash -c "rm -f $SQUASH_FILE"
srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
fi
export VLLM_CACHE_ROOT="/it-share/gharunners/.cache/vllm"

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
Expand Down
Loading