From de2710e64ba2edc60cef3b8c4d1e2f77e8a7f651 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 15:15:31 -0600 Subject: [PATCH 1/5] init --- .github/configs/nvidia-master.yaml | 28 ++--- .github/workflows/run-sweep.yml | 30 +++++ perf-changelog.yaml | 6 + utils/compare_results.py | 196 +++++++++++++++++++++++++++++ 4 files changed, 246 insertions(+), 14 deletions(-) create mode 100644 utils/compare_results.py diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 6885f36cb..61c90ca22 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3091,20 +3091,20 @@ gptoss-fp4-b200-vllm: - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 4 } + # - isl: 1024 + # osl: 8192 + # search-space: + # - { tp: 1, conc-start: 4, conc-end: 128 } + # - { tp: 2, conc-start: 4, conc-end: 128 } + # - { tp: 4, conc-start: 4, conc-end: 64 } + # - { tp: 8, conc-start: 4, conc-end: 8 } + # - isl: 8192 + # osl: 1024 + # search-space: + # - { tp: 1, conc-start: 4, conc-end: 128 } + # - { tp: 2, conc-start: 4, conc-end: 128 } + # - { tp: 4, conc-start: 4, conc-end: 64 } + # - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.15.1 diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 36cb1e8ff..4f531efe7 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -265,6 +265,36 @@ jobs: name: "run-stats" path: ${{ env.STATS_FILENAME }}.json + compare-results: + needs: + [ + collect-results, + setup, + ] + if: >- + always() && + github.event_name == 'pull_request' && + needs.collect-results.result == 'success' + runs-on: ubuntu-latest + + env: + DATABASE_URL: postgresql://neondb_owner:npg_W6SeFBUbTi4z@ep-late-voice-ailrin30-pooler.c-4.us-east-1.aws.neon.tech/neondb?sslmode=require + + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Download results artifacts + uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0 + with: + path: results/ + pattern: results_bmk + + - name: Install dependencies + run: pip install psycopg2-binary tabulate + + - name: Compare results against main + run: python3 utils/compare_results.py results/ >> $GITHUB_STEP_SUMMARY + trigger-vercel-deploy: needs: [ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index dbb3abc88..1ccfb7c6f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -881,3 +881,9 @@ - "Expanding TP search space" - "Adding kv-cache-fp8" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865 + +- config-keys: + - gptoss-fp4-b200-vllm + description: + - "test" + pr-link: test diff --git a/utils/compare_results.py b/utils/compare_results.py new file mode 100644 index 000000000..089d97cad --- /dev/null +++ b/utils/compare_results.py @@ -0,0 +1,196 @@ +import json +import os +import re +import sys +from pathlib import Path + +import psycopg2 +from tabulate import tabulate + + +def parse_bool(value): + if isinstance(value, bool): + return value + return str(value).lower() == "true" + + +def extract_hardware(runner): + """Strip suffixes like -multinode, -trt, -disagg from runner to get hardware name.""" + return re.split(r"-(multinode|trt|disagg)$", runner)[0].lower() + + +def build_config_params(result): + """Build the DB config lookup parameters from a result JSON.""" + is_multinode = result.get("is_multinode", False) + hw = extract_hardware(result["hw"]) + model = result["infmax_model_prefix"].lower() + framework = result["framework"].lower() + precision = result["precision"].lower() + spec_method = result.get("spec_decoding", "none").lower() + disagg = parse_bool(result.get("disagg", False)) + + if is_multinode: + return { + "hardware": hw, + "model": model, + "framework": framework, + "precision": precision, + "spec_method": spec_method, + "disagg": disagg, + "is_multinode": True, + "prefill_tp": int(result["prefill_tp"]), + "prefill_ep": int(result["prefill_ep"]), + "prefill_dp_attention": parse_bool(result["prefill_dp_attention"]), + "decode_tp": int(result["decode_tp"]), + "decode_ep": int(result["decode_ep"]), + "decode_dp_attention": parse_bool(result["decode_dp_attention"]), + } + else: + tp = int(result["tp"]) + ep = int(result["ep"]) + dp_attention = parse_bool(result["dp_attention"]) + return { + "hardware": hw, + "model": model, + "framework": framework, + "precision": precision, + "spec_method": spec_method, + "disagg": disagg, + "is_multinode": False, + "prefill_tp": tp, + "prefill_ep": ep, + "prefill_dp_attention": dp_attention, + "decode_tp": tp, + "decode_ep": ep, + "decode_dp_attention": dp_attention, + } + + +BASELINE_QUERY = """ + SELECT br.metrics->>'tput_per_gpu' as tput_per_gpu + FROM benchmark_results br + JOIN configs c ON c.id = br.config_id + JOIN workflow_runs wr ON wr.id = br.workflow_run_id + WHERE c.hardware = %(hardware)s + AND c.framework = %(framework)s + AND c.model = %(model)s + AND c.precision = %(precision)s + AND c.spec_method = %(spec_method)s + AND c.disagg = %(disagg)s + AND c.is_multinode = %(is_multinode)s + AND c.prefill_tp = %(prefill_tp)s + AND c.prefill_ep = %(prefill_ep)s + AND c.prefill_dp_attention = %(prefill_dp_attention)s + AND c.decode_tp = %(decode_tp)s + AND c.decode_ep = %(decode_ep)s + AND c.decode_dp_attention = %(decode_dp_attention)s + AND br.isl = %(isl)s + AND br.osl = %(osl)s + AND br.conc = %(conc)s + AND wr.head_branch = 'main' + AND br.error IS NULL + ORDER BY br.date DESC + LIMIT 1 +""" + + +def main(): + if len(sys.argv) < 2: + print("Usage: python compare_results.py ") + sys.exit(1) + + results_dir = Path(sys.argv[1]) + database_url = os.environ["DATABASE_URL"] + + # Load all benchmark result JSONs + results = [] + for path in results_dir.rglob("*.json"): + with open(path) as f: + results.append(json.load(f)) + + if not results: + print("No benchmark results found to compare.") + return + + conn = psycopg2.connect(database_url) + rows = [] + + for r in results: + config_params = build_config_params(r) + query_params = { + **config_params, + "isl": int(r["isl"]), + "osl": int(r["osl"]), + "conc": int(r["conc"]), + } + + with conn.cursor() as cur: + cur.execute(BASELINE_QUERY, query_params) + row = cur.fetchone() + + current_tput = float(r["tput_per_gpu"]) + baseline_tput = float(row[0]) if row else None + + if baseline_tput is not None and baseline_tput > 0: + delta = current_tput - baseline_tput + pct = (delta / baseline_tput) * 100 + delta_str = f"{delta:+.2f} ({pct:+.1f}%)" + else: + delta_str = "N/A (no baseline)" + + is_multinode = r.get("is_multinode", False) + if is_multinode: + parallelism = ( + f"P(tp{r['prefill_tp']}/ep{r['prefill_ep']}) " + f"D(tp{r['decode_tp']}/ep{r['decode_ep']})" + ) + else: + parallelism = f"tp{r['tp']}/ep{r['ep']}" + + rows.append({ + "model": r["infmax_model_prefix"], + "hw": extract_hardware(r["hw"]).upper(), + "framework": r["framework"], + "precision": r["precision"], + "parallelism": parallelism, + "isl": int(r["isl"]), + "osl": int(r["osl"]), + "conc": int(r["conc"]), + "current": current_tput, + "baseline": baseline_tput, + "delta_str": delta_str, + }) + + conn.close() + + rows.sort(key=lambda x: (x["model"], x["hw"], x["framework"], x["isl"], x["osl"], x["conc"])) + + headers = [ + "Model", "HW", "Framework", "Precision", "Parallelism", + "ISL", "OSL", "Conc", + "Current (tok/s/gpu)", "Baseline (tok/s/gpu)", "Delta", + ] + + table_rows = [ + [ + row["model"], + row["hw"], + row["framework"], + row["precision"], + row["parallelism"], + row["isl"], + row["osl"], + row["conc"], + f"{row['current']:.2f}", + f"{row['baseline']:.2f}" if row["baseline"] is not None else "N/A", + row["delta_str"], + ] + for row in rows + ] + + print("## Throughput Comparison vs. Most Recent\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) + + +if __name__ == "__main__": + main() From 653226ce8d685a350bd0b517461ec90b1136cbd5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 15:34:58 -0600 Subject: [PATCH 2/5] testing 1 --- .github/configs/nvidia-master.yaml | 6 +++--- .github/workflows/run-sweep.yml | 2 +- utils/compare_results.py | 8 ++++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 61c90ca22..a6e05185f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3088,9 +3088,9 @@ gptoss-fp4-b200-vllm: osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } + # - { tp: 2, conc-start: 4, conc-end: 128 } + # - { tp: 4, conc-start: 4, conc-end: 64 } + # - { tp: 8, conc-start: 4, conc-end: 8 } # - isl: 1024 # osl: 8192 # search-space: diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml index 4f531efe7..69344632e 100644 --- a/.github/workflows/run-sweep.yml +++ b/.github/workflows/run-sweep.yml @@ -278,7 +278,7 @@ jobs: runs-on: ubuntu-latest env: - DATABASE_URL: postgresql://neondb_owner:npg_W6SeFBUbTi4z@ep-late-voice-ailrin30-pooler.c-4.us-east-1.aws.neon.tech/neondb?sslmode=require + DATABASE_URL: ${{ secrets.NEON_PROD_RO_URL }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/utils/compare_results.py b/utils/compare_results.py index 089d97cad..07bbc4042 100644 --- a/utils/compare_results.py +++ b/utils/compare_results.py @@ -102,11 +102,15 @@ def main(): results_dir = Path(sys.argv[1]) database_url = os.environ["DATABASE_URL"] - # Load all benchmark result JSONs + # Load all benchmark result JSONs (files may contain a single dict or a list of dicts) results = [] for path in results_dir.rglob("*.json"): with open(path) as f: - results.append(json.load(f)) + data = json.load(f) + if isinstance(data, list): + results.extend(data) + else: + results.append(data) if not results: print("No benchmark results found to compare.") From 2ba1b81560541321e13e224ecfa87e975033463d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 16:10:15 -0600 Subject: [PATCH 3/5] testing 2 --- .github/configs/nvidia-master.yaml | 2 +- utils/compare_results.py | 120 ++++++++++++++++++++++------- 2 files changed, 92 insertions(+), 30 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a6e05185f..3f64d043f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3087,7 +3087,7 @@ gptoss-fp4-b200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 1, conc-start: 4, conc-end: 8 } # - { tp: 2, conc-start: 4, conc-end: 128 } # - { tp: 4, conc-start: 4, conc-end: 64 } # - { tp: 8, conc-start: 4, conc-end: 8 } diff --git a/utils/compare_results.py b/utils/compare_results.py index 07bbc4042..6ff345da0 100644 --- a/utils/compare_results.py +++ b/utils/compare_results.py @@ -66,14 +66,17 @@ def build_config_params(result): } +# Use LIKE prefix match on model to handle cases where DB model name +# differs from model-prefix (e.g. model-prefix "gptoss" -> DB "gptoss120b") BASELINE_QUERY = """ - SELECT br.metrics->>'tput_per_gpu' as tput_per_gpu + SELECT br.metrics->>'tput_per_gpu' as tput_per_gpu, + c.model as db_model FROM benchmark_results br JOIN configs c ON c.id = br.config_id JOIN workflow_runs wr ON wr.id = br.workflow_run_id WHERE c.hardware = %(hardware)s AND c.framework = %(framework)s - AND c.model = %(model)s + AND c.model LIKE %(model)s || '%%' AND c.precision = %(precision)s AND c.spec_method = %(spec_method)s AND c.disagg = %(disagg)s @@ -112,12 +115,16 @@ def main(): else: results.append(data) + print(f"Loaded {len(results)} benchmark results", file=sys.stderr) + if not results: print("No benchmark results found to compare.") return conn = psycopg2.connect(database_url) rows = [] + matched = 0 + unmatched = 0 for r in results: config_params = build_config_params(r) @@ -128,10 +135,19 @@ def main(): "conc": int(r["conc"]), } + print(f"\nQuery params: {json.dumps({k: str(v) for k, v in query_params.items()}, indent=2)}", file=sys.stderr) + with conn.cursor() as cur: cur.execute(BASELINE_QUERY, query_params) row = cur.fetchone() + if row: + matched += 1 + print(f" -> Matched DB model={row[1]}, tput={row[0]}", file=sys.stderr) + else: + unmatched += 1 + print(f" -> No baseline found", file=sys.stderr) + current_tput = float(r["tput_per_gpu"]) baseline_tput = float(row[0]) if row else None @@ -151,11 +167,12 @@ def main(): else: parallelism = f"tp{r['tp']}/ep{r['ep']}" - rows.append({ + row_data = { "model": r["infmax_model_prefix"], + "served_model": r["model"], "hw": extract_hardware(r["hw"]).upper(), - "framework": r["framework"], - "precision": r["precision"], + "framework": r["framework"].upper(), + "precision": r["precision"].upper(), "parallelism": parallelism, "isl": int(r["isl"]), "osl": int(r["osl"]), @@ -163,37 +180,82 @@ def main(): "current": current_tput, "baseline": baseline_tput, "delta_str": delta_str, - }) + } + if not is_multinode: + row_data["dp_attention"] = r.get("dp_attention", False) + rows.append(row_data) conn.close() + print(f"\nSummary: {matched} matched, {unmatched} unmatched out of {len(results)} results", file=sys.stderr) + rows.sort(key=lambda x: (x["model"], x["hw"], x["framework"], x["isl"], x["osl"], x["conc"])) - headers = [ - "Model", "HW", "Framework", "Precision", "Parallelism", - "ISL", "OSL", "Conc", - "Current (tok/s/gpu)", "Baseline (tok/s/gpu)", "Delta", - ] - - table_rows = [ - [ - row["model"], - row["hw"], - row["framework"], - row["precision"], - row["parallelism"], - row["isl"], - row["osl"], - row["conc"], - f"{row['current']:.2f}", - f"{row['baseline']:.2f}" if row["baseline"] is not None else "N/A", - row["delta_str"], + single_node = [r for r in rows if "P(" not in r["parallelism"]] + multi_node = [r for r in rows if "P(" in r["parallelism"]] + + if single_node: + headers = [ + "Model", "Served Model", "Hardware", "Framework", "Precision", + "ISL", "OSL", "TP", "EP", "DP Attention", "Conc", + "TPUT per GPU", "Baseline TPUT per GPU", "Delta", + ] + table_rows = [] + for row in single_node: + parts = row["parallelism"] # "tp1/ep1" + tp_val = parts.split("/")[0].replace("tp", "") + ep_val = parts.split("/")[1].replace("ep", "") + table_rows.append([ + row["model"], + row["served_model"], + row["hw"], + row["framework"], + row["precision"], + row["isl"], + row["osl"], + tp_val, + ep_val, + row.get("dp_attention", False), + row["conc"], + f"{row['current']:.4f}", + f"{row['baseline']:.4f}" if row["baseline"] is not None else "N/A", + row["delta_str"], + ]) + + print("## Single-Node Throughput Comparison vs. Most Recent\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) + print() + + if multi_node: + headers = [ + "Model", "Served Model", "Hardware", "Framework", "Precision", + "ISL", "OSL", "Prefill TP", "Prefill EP", "Decode TP", "Decode EP", + "Conc", "TPUT per GPU", "Baseline TPUT per GPU", "Delta", ] - for row in rows - ] + table_rows = [] + for row in multi_node: + # Parse P(tp4/ep4) D(tp8/ep8) + m = re.match(r"P\(tp(\d+)/ep(\d+)\) D\(tp(\d+)/ep(\d+)\)", row["parallelism"]) + table_rows.append([ + row["model"], + row["served_model"], + row["hw"], + row["framework"], + row["precision"], + row["isl"], + row["osl"], + m.group(1) if m else "", + m.group(2) if m else "", + m.group(3) if m else "", + m.group(4) if m else "", + row["conc"], + f"{row['current']:.4f}", + f"{row['baseline']:.4f}" if row["baseline"] is not None else "N/A", + row["delta_str"], + ]) - print("## Throughput Comparison vs. Most Recent\n") - print(tabulate(table_rows, headers=headers, tablefmt="github")) + print("## Multi-Node Throughput Comparison vs. Most Recent\n") + print(tabulate(table_rows, headers=headers, tablefmt="github")) if __name__ == "__main__": From 8618bebbdb446cb7db35d7d111a633814848c96e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 16:24:52 -0600 Subject: [PATCH 4/5] testing 3 --- .github/configs/nvidia-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3f64d043f..61c90ca22 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3087,10 +3087,10 @@ gptoss-fp4-b200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 4, conc-end: 8 } - # - { tp: 2, conc-start: 4, conc-end: 128 } - # - { tp: 4, conc-start: 4, conc-end: 64 } - # - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } # - isl: 1024 # osl: 8192 # search-space: From e857d363ed0fa48da60be32e35729e0540de5233 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 6 Mar 2026 16:55:41 -0600 Subject: [PATCH 5/5] testing 4 --- .github/configs/nvidia-master.yaml | 28 ++++++++++++++-------------- perf-changelog.yaml | 2 ++ 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 61c90ca22..6885f36cb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -3091,20 +3091,20 @@ gptoss-fp4-b200-vllm: - { tp: 2, conc-start: 4, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - # - isl: 1024 - # osl: 8192 - # search-space: - # - { tp: 1, conc-start: 4, conc-end: 128 } - # - { tp: 2, conc-start: 4, conc-end: 128 } - # - { tp: 4, conc-start: 4, conc-end: 64 } - # - { tp: 8, conc-start: 4, conc-end: 8 } - # - isl: 8192 - # osl: 1024 - # search-space: - # - { tp: 1, conc-start: 4, conc-end: 128 } - # - { tp: 2, conc-start: 4, conc-end: 128 } - # - { tp: 4, conc-start: 4, conc-end: 64 } - # - { tp: 8, conc-start: 4, conc-end: 4 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.15.1 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 1ccfb7c6f..51e570373 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -884,6 +884,8 @@ - config-keys: - gptoss-fp4-b200-vllm + - dsr1-fp8-h200-dynamo-trt + - dsr1-fp4-mi355x-sglang-disagg description: - "test" pr-link: test