diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b4084e5e4..178694a78 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1650,7 +1650,7 @@ dsr1-fp8-b300-dynamo-trt:
     dp-attn: true
 
 dsr1-fp4-b200-sglang:
-  image: lmsysorg/sglang:v0.5.6-cu129-amd64
+  image: lmsysorg/sglang:v0.5.9-cu129-amd64
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
   runner: b200
@@ -1760,7 +1760,7 @@ dsr1-fp4-b200-trt-mtp:
     - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
 
 dsr1-fp8-b200-sglang:
-  image: lmsysorg/sglang:v0.5.6-cu129-amd64
+  image: lmsysorg/sglang:v0.5.9-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: b200
@@ -1827,7 +1827,7 @@ kimik2.5-int4-b200-vllm:
     - { tp: 8, conc-start: 4, conc-end: 64 }
 
 dsr1-fp8-b200-sglang-mtp:
-  image: lmsysorg/sglang:v0.5.8-cu130-amd64
+  image: lmsysorg/sglang:v0.5.9-cu130-amd64
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: b200
@@ -1906,7 +1906,7 @@ dsr1-fp8-b200-trt-mtp:
     - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
 
 dsr1-fp8-h200-sglang:
-  image: lmsysorg/sglang:v0.5.7-cu129-amd64
+  image: lmsysorg/sglang:v0.5.9-cu129-amd64
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: h200
diff --git a/benchmarks/single_node/dsr1_fp8_b200.sh b/benchmarks/single_node/dsr1_fp8_b200.sh
index 015f45f30..5d1088b5e 100644
--- a/benchmarks/single_node/dsr1_fp8_b200.sh
+++ b/benchmarks/single_node/dsr1_fp8_b200.sh
@@ -20,7 +20,7 @@ nvidia-smi
 
 hf download "$MODEL"
 
-export SGL_ENABLE_JIT_DEEPGEMM=false
+export SGLANG_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9c4c9e438..9feb2499f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -733,4 +733,19 @@
     - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)"
     - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699
-
+
+- config-keys:
+    - dsr1-fp4-b200-sglang
+    - dsr1-fp8-b200-sglang
+    - dsr1-fp8-b200-sglang-mtp
+    - dsr1-fp8-h200-sglang
+  description:
+    - "Update SGLang image to v0.5.9 for NVIDIA single-node DeepSeek R1 configs"
+    - "dsr1-fp4-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu129-amd64"
+    - "dsr1-fp8-b200-sglang: v0.5.6-cu129-amd64 → v0.5.9-cu129-amd64"
+    - "dsr1-fp8-b200-sglang-mtp: v0.5.8-cu130-amd64 → v0.5.9-cu130-amd64"
+    - "dsr1-fp8-h200-sglang: v0.5.7-cu129-amd64 → v0.5.9-cu129-amd64"
+    - "Fix deprecated SGL_ENABLE_JIT_DEEPGEMM → SGLANG_ENABLE_JIT_DEEPGEMM in dsr1_fp8_b200.sh"
+    - "SGLang 0.5.9 gains: Flashinfer 0.6.3, TRT-LLM NSA kernels for Blackwell, SpecV2 GC bug fix, MoE fused kernel optimizations"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/814