diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 543864d4a..3bc9b81ad 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -3260,7 +3260,7 @@ dsr1-fp8-h100-dynamo-sglang:
   dp-attn: true
 
 gptoss-fp4-h200-trt:
-  image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev
+  image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc5
   model: openai/gpt-oss-120b
   model-prefix: gptoss
   runner: h200
diff --git a/benchmarks/single_node/gptoss_fp4_h200_trt.sh b/benchmarks/single_node/gptoss_fp4_h200_trt.sh
index f230cd9b0..d4891130c 100644
--- a/benchmarks/single_node/gptoss_fp4_h200_trt.sh
+++ b/benchmarks/single_node/gptoss_fp4_h200_trt.sh
@@ -13,9 +13,6 @@ check_env_vars \
   DP_ATTENTION \
   EP_SIZE
 
-# TensorRT bug. Remove when fixed
-sed -i '417d' /usr/local/lib/python3.12/dist-packages/tensorrt_llm/executor/result.py
-
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -36,7 +33,6 @@ cuda_graph_config:
 enable_attention_dp: $DP_ATTENTION
 kv_cache_config:
   dtype: auto
-  enable_block_reuse: false
   free_gpu_memory_fraction: 0.85
 moe_config:
   backend: TRITON
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 006522fc8..6b0beb137 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -824,3 +824,12 @@
     - "Uses triton attention backend, TP=8, concurrency 4-64"
     - "Following AMD Andy Luo's recipe"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/850
+
+- config-keys:
+    - gptoss-fp4-h200-trt
+  description:
+    - "Upgrade TensorRT-LLM container from release:gpt-oss-dev to release:1.3.0rc5"
+    - "Remove sed hack for TensorRT bug (fixed upstream in 1.3.0rc5)"
+    - "Remove enable_block_reuse: false from kv_cache_config (default true is now recommended)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/854
+