Commits
78 commits
f0b8f42
[AMD]: fix AITER flags for vllm v0.14.0 docker image (#535)
Rohan138 Jan 26, 2026
0b66978
fix: add final newline to original perf-changelog.yaml so that there …
cquil11 Jan 26, 2026
8252370
chore: run official sweep to add evals (#558)
Oseltamivir Jan 26, 2026
bac15ad
[NV] dsr1 fp4 b300 dynamo trtllm (#532)
jthomson04 Jan 27, 2026
a045c1b
fix: add final newline to original perf-changelog.yaml so that there …
cquil11 Jan 27, 2026
f335aa8
Chun reima/sglang fp8 v0.5.8 mi355 (#572)
chunfangamd Jan 27, 2026
36b9afa
initial commit
cquil11 Jan 27, 2026
848fbca
Revert "[NV] dsr1 fp4 b300 dynamo trtllm (#532)" (#583) [skip-sweep]
cquil11 Jan 27, 2026
61b2b21
Increase eval timeout (#584)
Oseltamivir Jan 27, 2026
d541bf0
chore: save server long as artifact after single node runs (#576)
cquil11 Jan 27, 2026
28bc58e
revert (#586) [skip-sweep]
cquil11 Jan 27, 2026
5ca7960
chore: add pre-merge check for newline in perf-changelog.yaml (#579)
cquil11 Jan 27, 2026
77fe65b
Merge remote-tracking branch 'origin/main' into experimental/multi-tu…
cquil11 Jan 27, 2026
36110f1
commit
cquil11 Jan 29, 2026
a10b511
chore: add large data files to gitignore
cquil11 Jan 29, 2026
a1acc8a
update vllm bench
cquil11 Jan 30, 2026
8a33d1f
save response
cquil11 Jan 30, 2026
0735853
metrics collector
cquil11 Feb 2, 2026
b5ffc81
add pbar
cquil11 Feb 3, 2026
d1a5cc5
add pbar
cquil11 Feb 3, 2026
26d7e0a
metrics collector fix
cquil11 Feb 3, 2026
bf045a0
cpu offload metrics
cquil11 Feb 3, 2026
339d71a
cpu offload metrics pt 2
cquil11 Feb 3, 2026
47c5160
fix man num requests
cquil11 Feb 3, 2026
d994856
fix man num requests pt 2
cquil11 Feb 3, 2026
33ca9fa
fix join deadlock
cquil11 Feb 3, 2026
c6c2e84
add new plots
cquil11 Feb 3, 2026
997ed00
add dcgmi and remove some plots
cquil11 Feb 3, 2026
5f1eef1
make tx / rx plots continuous
cquil11 Feb 3, 2026
c42aa1b
make tx / rx plots continuous and add cumsum
cquil11 Feb 3, 2026
b43d46a
stop collection
cquil11 Feb 3, 2026
25de130
always generate plots when metrics collector exists
cquil11 Feb 3, 2026
6aa3680
add metrics csv
cquil11 Feb 4, 2026
d449c7e
sweep
cquil11 Feb 4, 2026
5a2bfa2
sweep
cquil11 Feb 4, 2026
c4f8562
sweep
cquil11 Feb 4, 2026
e7226f1
sweep
cquil11 Feb 4, 2026
1a6c303
resume
cquil11 Feb 4, 2026
1103782
change
cquil11 Feb 4, 2026
feb0f3a
change cleanup
cquil11 Feb 4, 2026
337a203
retries
cquil11 Feb 4, 2026
193d206
lower num requests
cquil11 Feb 4, 2026
4d7f2b6
more aggressive cleanup
cquil11 Feb 4, 2026
6aaf832
test
cquil11 Feb 6, 2026
ccb3fe8
full sweep
cquil11 Feb 9, 2026
4ac0155
limit max num requests
cquil11 Feb 9, 2026
ebd571e
reorganize
cquil11 Feb 25, 2026
bab1543
reorganize
cquil11 Feb 26, 2026
2582608
update
cquil11 Feb 27, 2026
c3d0a88
update gitignore
cquil11 Feb 27, 2026
e0b46a6
add new sweep
cquil11 Feb 27, 2026
41514f6
add durationg
cquil11 Feb 27, 2026
68b4d0e
add cd to right dir
cquil11 Feb 27, 2026
756f8c2
add cd to right dir
cquil11 Feb 27, 2026
df796c3
add cd to right dir
cquil11 Feb 27, 2026
6584976
max num seqs
cquil11 Feb 27, 2026
f8abfe2
get rid of max cudagraph batch size
cquil11 Feb 27, 2026
b217e6e
get rid of max cudagraph batch size
cquil11 Feb 27, 2026
c24d543
add new metrics
cquil11 Feb 27, 2026
a5e840f
add new launch scripts
cquil11 Feb 27, 2026
eb7e8ec
Merge remote-tracking branch 'origin/main' into experimental/multi-tu…
cquil11 Feb 27, 2026
95b8d98
add new launch scripts
cquil11 Feb 27, 2026
7abc842
add on push
cquil11 Feb 27, 2026
042652e
rm on push
cquil11 Feb 27, 2026
964b8dc
fix file path
cquil11 Feb 27, 2026
0284acf
pip install requirements before runnung
cquil11 Feb 27, 2026
f32a888
Add sample_20k_realistic.json via Git LFS
cquil11 Feb 27, 2026
1c751b0
Add sample_20k_realistic.json via Git LFS pt 2
cquil11 Feb 27, 2026
c40d9cc
install git lfs
cquil11 Feb 27, 2026
1232c35
download dataset
cquil11 Feb 27, 2026
0ed31b8
download dataset pt 2
cquil11 Feb 27, 2026
07413ad
download dataset pt 2
cquil11 Feb 27, 2026
9171abf
5 minutes
cquil11 Feb 27, 2026
f590698
kill clients immediately
cquil11 Feb 28, 2026
124f126
kill clients immediately pt 2
cquil11 Feb 28, 2026
4bfd76f
sentinel
cquil11 Feb 28, 2026
3b926b9
add more analysis scripts
cquil11 Feb 28, 2026
36637ab
add more graphinh
cquil11 Mar 5, 2026
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
experimental/multiturn/vllm_benchmark/sample_20k_realistic.json filter=lfs diff=lfs merge=lfs -text
158 changes: 158 additions & 0 deletions .github/workflows/benchmark-multiturn-tmpl.yml
@@ -0,0 +1,158 @@
name: Template - Multi-Turn Benchmark
on:
workflow_call:
inputs:
runner:
required: true
type: string
image:
required: true
type: string
model:
required: true
type: string
precision:
required: false
type: string
default: 'fp4'
exp-name:
required: true
type: string
tp:
required: true
type: string
users:
required: true
type: string
offload-mode:
description: "on = prefix+offload, off = prefix only, noprefix = no prefix caching"
required: true
type: string
duration:
required: true
type: string
think-time:
description: "Log-normal think-time params (mu,sigma)"
required: true
type: string
total-cpu-dram-gb:
required: false
type: string
default: '300'
ref:
description: "Git ref (branch/sha) to checkout"
required: false
type: string

env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
HF_HUB_CACHE: '/mnt/hf_hub_cache/'
EXP_NAME: ${{ inputs.exp-name }}
MODEL: ${{ inputs.model }}
IMAGE: ${{ inputs.image }}
PRECISION: ${{ inputs.precision }}
FRAMEWORK: 'vllm'
TP: ${{ inputs.tp }}
USERS: ${{ inputs.users }}
OFFLOAD_MODE: ${{ inputs.offload-mode }}
DURATION: ${{ inputs.duration }}
THINK_TIME: ${{ inputs.think-time }}
TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
SPEC_DECODING: 'off'

permissions:
contents: read

jobs:
benchmark:
runs-on: ${{ inputs.runner }}
timeout-minutes: 180
name: "${{ inputs.exp-name }} tp=${{ inputs.tp }} users=${{ inputs.users }} offload=${{ inputs.offload-mode }}"
steps:
- name: Resource cleanup (pre-run)
# GitHub Actions workflow YAML does not support anchors/aliases, so this
# cleanup script cannot be shared via &anchor/*alias; it has to be written
# out in each step that needs it.
run: |
# Cleanup Docker resources
if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
echo "[Docker] Cleaning up resources ..."
docker ps -aq | xargs -r docker rm -f
docker network prune -f
while [ -n "$(docker ps -aq)" ]; do
docker ps -a
sleep 5
done
fi

# Cleanup SLURM resources
if command -v squeue >/dev/null 2>&1; then
if [[ "${{ runner.name }}" == mi355x-amds* || "${{ runner.name }}" == gb200-nv* || "${{ runner.name }}" == gb300-nv* || "${{ runner.name }}" == h100-dgxc-slurm* || "${{ runner.name }}" == h200-dgxc-slurm* || "${{ runner.name }}" == b200-dgxc-slurm* ]]; then
echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
scancel --name="${{ runner.name }}" || true
while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
squeue --name="${{ runner.name }}"
sleep 5
done
else
echo "[Slurm] Cleaning up jobs for user: $USER ..."
scancel -u "$USER" || true
while [ -n "$(squeue -u "$USER" --noheader --format='%i')" ]; do
squeue -u "$USER"
sleep 5
done
fi
fi

- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
fetch-depth: 0
ref: ${{ inputs.ref || github.ref }}


- name: Launch job script
env:
RUNNER_NAME: ${{ runner.name }}
RESULT_DIR: /workspace/results
run: |
bash ./runners/launch_${RUNNER_NAME%%_*}.sh

# The runner script doesn't propagate exit codes (scancel masks them).
# Check status.txt to determine if the benchmark actually succeeded.
if [ ! -f results/status.txt ]; then
echo "Run failed: results/status.txt not found." >&2
exit 1
fi
STATUS=$(cat results/status.txt)
if [ "$STATUS" != "SUCCESS" ]; then
echo "Run failed: status=$STATUS" >&2
cat results/benchmark.log 2>/dev/null || true
exit 1
fi

- name: Upload results
if: always()
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
with:
name: "multiturn_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}"
path: |
results/metrics_client_metrics.csv
results/metrics_server_metrics.csv
results/metrics_plots.png
results/benchmark.log
results/server.log
results/config.yaml
results/vllm_command.txt
results/benchmark_command.txt
results/status.txt
if-no-files-found: ignore

- name: Upload server logs
if: always()
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
with:
name: "server_logs_tp${{ inputs.tp }}_users${{ inputs.users }}_offload${{ inputs.offload-mode }}"
path: results/server.log
if-no-files-found: ignore

- name: Resource cleanup (post-run)
if: always()
# GitHub Actions workflow YAML does not support anchors/aliases, so the
# pre-run cleanup is repeated here in condensed form instead of an *alias.
run: |
if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
echo "[Docker] Cleaning up resources ..."
docker ps -aq | xargs -r docker rm -f
docker network prune -f
fi
if command -v squeue >/dev/null 2>&1; then
echo "[Slurm] Cleaning up jobs ..."
scancel --name="${{ runner.name }}" 2>/dev/null || true
scancel -u "$USER" 2>/dev/null || true
fi
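The template's `think-time` input is a "mu,sigma" string parameterising a log-normal distribution of per-turn user think times. As an illustration of that convention only (the benchmark client itself is not shown in this diff; `parse_think_time` and `sample_think_time` are hypothetical helper names), the string can be parsed and sampled with the Python standard library:

```python
import random


def parse_think_time(spec: str) -> tuple[float, float]:
    """Parse the workflow's think-time string, e.g. "1.39,1.26" -> (1.39, 1.26)."""
    mu, sigma = (float(x) for x in spec.split(","))
    return mu, sigma


def sample_think_time(mu: float, sigma: float) -> float:
    """Draw one per-turn think time in seconds. mu and sigma are the
    parameters of the underlying normal, as in random.lognormvariate."""
    return random.lognormvariate(mu, sigma)
```

With the sweep default of `1.39,1.26`, the median think time is exp(1.39) ≈ 4 s with a heavy right tail, which is the usual shape for modeling human pauses between chat turns.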
120 changes: 120 additions & 0 deletions .github/workflows/multiturn-sweep.yml
@@ -0,0 +1,120 @@
name: Multi-Turn Benchmark Sweep
run-name: "Multi-Turn Sweep - tp=${{ inputs.tp_values }} users=${{ inputs.user_values }} offload=${{ inputs.offload_values }}"

on:
# push:
# branches:
# - experimental/multi-turn-benchmark
# paths:
# - .github/workflows/multiturn-sweep.yml
workflow_dispatch:
inputs:
tp_values:
description: 'TP sizes (JSON array)'
required: true
default: '[1, 2, 4, 8]'
type: string
user_values:
description: 'Concurrent user counts (JSON array)'
required: true
default: '[8, 16, 32, 64, 128, 256, 512, 1024, 2048]'
type: string
offload_values:
description: 'Offload modes (JSON array: on/off/noprefix)'
required: true
default: '["on", "off", "noprefix"]'
type: string
duration:
description: 'Benchmark duration in seconds'
required: true
default: '300'
type: string
think_time:
description: 'Log-normal think-time params (mu,sigma)'
required: true
default: '1.39,1.26'
type: string
total_cpu_dram_gb:
description: 'Total CPU DRAM for KV offload (GB)'
required: true
default: '100'
type: string
image:
description: 'Container image'
required: true
default: 'vllm/vllm-openai:v0.16.0'
type: string
model:
description: 'Model name'
required: true
default: 'nvidia/Llama-3.3-70B-Instruct-FP4'
type: string
ref:
description: 'Git ref (branch/sha) to checkout'
required: false
type: string

jobs:
# ---------------------------------------------------------------------------
# Matrix benchmark jobs — each cell calls the multiturn template
# ---------------------------------------------------------------------------
sweep:
uses: ./.github/workflows/benchmark-multiturn-tmpl.yml
name: "sweep tp=${{ matrix.tp }} users=${{ matrix.users }} offload=${{ matrix.offload }}"
strategy:
fail-fast: false
matrix:
tp: ${{ fromJson(inputs.tp_values) }}
users: ${{ fromJson(inputs.user_values) }}
offload: ${{ fromJson(inputs.offload_values) }}
secrets: inherit
with:
runner: b200
image: ${{ inputs.image }}
model: ${{ inputs.model }}
exp-name: "multiturn_tp${{ matrix.tp }}_users${{ matrix.users }}_offload${{ matrix.offload }}"
tp: "${{ matrix.tp }}"
users: "${{ matrix.users }}"
offload-mode: ${{ matrix.offload }}
duration: ${{ inputs.duration }}
think-time: ${{ inputs.think_time }}
total-cpu-dram-gb: ${{ inputs.total_cpu_dram_gb }}
ref: ${{ inputs.ref }}

# ---------------------------------------------------------------------------
# Collect & aggregate results
# ---------------------------------------------------------------------------
collect:
runs-on: ubuntu-latest
needs: sweep
if: always()
name: Collect results
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
token: ${{ secrets.REPO_PAT }}
fetch-depth: 1
ref: ${{ inputs.ref || github.ref }}

- uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Install dependencies
run: pip install pandas matplotlib numpy

- name: Download all artifacts
uses: actions/download-artifact@v4
with:
pattern: 'multiturn_*'
path: results/

- name: Run aggregation
run: |
python experimental/multiturn/vllm_benchmark/scripts/collect_sweep_results.py results/ aggregated/

- name: Upload aggregated results
uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
with:
name: multiturn_aggregated
path: aggregated/
5 changes: 4 additions & 1 deletion .gitignore
@@ -1,2 +1,5 @@
**/__pycache__/**
**/.coverage
**/.coverage
# Large data files
experimental/multiturn/vllm_benchmark/sharegpt_20230401_clean_lang_split.json
experimental/qwen_traceA_blksz_16.jsonl