Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,36 @@ jobs:
name: "run-stats"
path: ${{ env.STATS_FILENAME }}.json

compare-results:
needs:
[
collect-results,
setup,
]
if: >-
always() &&
github.event_name == 'pull_request' &&
needs.collect-results.result == 'success'
runs-on: ubuntu-latest

env:
DATABASE_URL: ${{ secrets.NEON_PROD_RO_URL }}

steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

- name: Download results artifacts
uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3 # v8.0.0
with:
path: results/
pattern: results_bmk

- name: Install dependencies
run: pip install psycopg2-binary tabulate

- name: Compare results against main
run: python3 utils/compare_results.py results/ >> $GITHUB_STEP_SUMMARY

trigger-vercel-deploy:
needs:
[
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -881,3 +881,11 @@
- "Expanding TP search space"
- "Adding kv-cache-fp8"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/865

- config-keys:
- gptoss-fp4-b200-vllm
- dsr1-fp8-h200-dynamo-trt
- dsr1-fp4-mi355x-sglang-disagg
description:
- "test"
pr-link: test
262 changes: 262 additions & 0 deletions utils/compare_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
import json
import os
import re
import sys
from pathlib import Path

import psycopg2
from tabulate import tabulate


def parse_bool(value):
if isinstance(value, bool):
return value
return str(value).lower() == "true"


def extract_hardware(runner):
"""Strip suffixes like -multinode, -trt, -disagg from runner to get hardware name."""
return re.split(r"-(multinode|trt|disagg)$", runner)[0].lower()


def build_config_params(result):
"""Build the DB config lookup parameters from a result JSON."""
is_multinode = result.get("is_multinode", False)
hw = extract_hardware(result["hw"])
model = result["infmax_model_prefix"].lower()
framework = result["framework"].lower()
precision = result["precision"].lower()
spec_method = result.get("spec_decoding", "none").lower()
disagg = parse_bool(result.get("disagg", False))

if is_multinode:
return {
"hardware": hw,
"model": model,
"framework": framework,
"precision": precision,
"spec_method": spec_method,
"disagg": disagg,
"is_multinode": True,
"prefill_tp": int(result["prefill_tp"]),
"prefill_ep": int(result["prefill_ep"]),
"prefill_dp_attention": parse_bool(result["prefill_dp_attention"]),
"decode_tp": int(result["decode_tp"]),
"decode_ep": int(result["decode_ep"]),
"decode_dp_attention": parse_bool(result["decode_dp_attention"]),
}
else:
tp = int(result["tp"])
ep = int(result["ep"])
dp_attention = parse_bool(result["dp_attention"])
return {
"hardware": hw,
"model": model,
"framework": framework,
"precision": precision,
"spec_method": spec_method,
"disagg": disagg,
"is_multinode": False,
"prefill_tp": tp,
"prefill_ep": ep,
"prefill_dp_attention": dp_attention,
"decode_tp": tp,
"decode_ep": ep,
"decode_dp_attention": dp_attention,
}


# Use LIKE prefix match on model to handle cases where DB model name
# differs from model-prefix (e.g. model-prefix "gptoss" -> DB "gptoss120b")
BASELINE_QUERY = """
SELECT br.metrics->>'tput_per_gpu' as tput_per_gpu,
c.model as db_model
FROM benchmark_results br
JOIN configs c ON c.id = br.config_id
JOIN workflow_runs wr ON wr.id = br.workflow_run_id
WHERE c.hardware = %(hardware)s
AND c.framework = %(framework)s
AND c.model LIKE %(model)s || '%%'
AND c.precision = %(precision)s
AND c.spec_method = %(spec_method)s
AND c.disagg = %(disagg)s
AND c.is_multinode = %(is_multinode)s
AND c.prefill_tp = %(prefill_tp)s
AND c.prefill_ep = %(prefill_ep)s
AND c.prefill_dp_attention = %(prefill_dp_attention)s
AND c.decode_tp = %(decode_tp)s
AND c.decode_ep = %(decode_ep)s
AND c.decode_dp_attention = %(decode_dp_attention)s
AND br.isl = %(isl)s
AND br.osl = %(osl)s
AND br.conc = %(conc)s
AND wr.head_branch = 'main'
AND br.error IS NULL
ORDER BY br.date DESC
LIMIT 1
"""


def main():
if len(sys.argv) < 2:
print("Usage: python compare_results.py <results_dir>")
sys.exit(1)

results_dir = Path(sys.argv[1])
database_url = os.environ["DATABASE_URL"]

# Load all benchmark result JSONs (files may contain a single dict or a list of dicts)
results = []
for path in results_dir.rglob("*.json"):
with open(path) as f:
data = json.load(f)
if isinstance(data, list):
results.extend(data)
else:
results.append(data)

print(f"Loaded {len(results)} benchmark results", file=sys.stderr)

if not results:
print("No benchmark results found to compare.")
return

conn = psycopg2.connect(database_url)
rows = []
matched = 0
unmatched = 0

for r in results:
config_params = build_config_params(r)
query_params = {
**config_params,
"isl": int(r["isl"]),
"osl": int(r["osl"]),
"conc": int(r["conc"]),
}

print(f"\nQuery params: {json.dumps({k: str(v) for k, v in query_params.items()}, indent=2)}", file=sys.stderr)

with conn.cursor() as cur:
cur.execute(BASELINE_QUERY, query_params)
row = cur.fetchone()

if row:
matched += 1
print(f" -> Matched DB model={row[1]}, tput={row[0]}", file=sys.stderr)
else:
unmatched += 1
print(f" -> No baseline found", file=sys.stderr)

current_tput = float(r["tput_per_gpu"])
baseline_tput = float(row[0]) if row else None

if baseline_tput is not None and baseline_tput > 0:
delta = current_tput - baseline_tput
pct = (delta / baseline_tput) * 100
delta_str = f"{delta:+.2f} ({pct:+.1f}%)"
else:
delta_str = "N/A (no baseline)"

is_multinode = r.get("is_multinode", False)
if is_multinode:
parallelism = (
f"P(tp{r['prefill_tp']}/ep{r['prefill_ep']}) "
f"D(tp{r['decode_tp']}/ep{r['decode_ep']})"
)
else:
parallelism = f"tp{r['tp']}/ep{r['ep']}"

row_data = {
"model": r["infmax_model_prefix"],
"served_model": r["model"],
"hw": extract_hardware(r["hw"]).upper(),
"framework": r["framework"].upper(),
"precision": r["precision"].upper(),
"parallelism": parallelism,
"isl": int(r["isl"]),
"osl": int(r["osl"]),
"conc": int(r["conc"]),
"current": current_tput,
"baseline": baseline_tput,
"delta_str": delta_str,
}
if not is_multinode:
row_data["dp_attention"] = r.get("dp_attention", False)
rows.append(row_data)

conn.close()

print(f"\nSummary: {matched} matched, {unmatched} unmatched out of {len(results)} results", file=sys.stderr)

rows.sort(key=lambda x: (x["model"], x["hw"], x["framework"], x["isl"], x["osl"], x["conc"]))

single_node = [r for r in rows if "P(" not in r["parallelism"]]
multi_node = [r for r in rows if "P(" in r["parallelism"]]

if single_node:
headers = [
"Model", "Served Model", "Hardware", "Framework", "Precision",
"ISL", "OSL", "TP", "EP", "DP Attention", "Conc",
"TPUT per GPU", "Baseline TPUT per GPU", "Delta",
]
table_rows = []
for row in single_node:
parts = row["parallelism"] # "tp1/ep1"
tp_val = parts.split("/")[0].replace("tp", "")
ep_val = parts.split("/")[1].replace("ep", "")
table_rows.append([
row["model"],
row["served_model"],
row["hw"],
row["framework"],
row["precision"],
row["isl"],
row["osl"],
tp_val,
ep_val,
row.get("dp_attention", False),
row["conc"],
f"{row['current']:.4f}",
f"{row['baseline']:.4f}" if row["baseline"] is not None else "N/A",
row["delta_str"],
])

print("## Single-Node Throughput Comparison vs. Most Recent\n")
print(tabulate(table_rows, headers=headers, tablefmt="github"))
print()

if multi_node:
headers = [
"Model", "Served Model", "Hardware", "Framework", "Precision",
"ISL", "OSL", "Prefill TP", "Prefill EP", "Decode TP", "Decode EP",
"Conc", "TPUT per GPU", "Baseline TPUT per GPU", "Delta",
]
table_rows = []
for row in multi_node:
# Parse P(tp4/ep4) D(tp8/ep8)
m = re.match(r"P\(tp(\d+)/ep(\d+)\) D\(tp(\d+)/ep(\d+)\)", row["parallelism"])
table_rows.append([
row["model"],
row["served_model"],
row["hw"],
row["framework"],
row["precision"],
row["isl"],
row["osl"],
m.group(1) if m else "",
m.group(2) if m else "",
m.group(3) if m else "",
m.group(4) if m else "",
row["conc"],
f"{row['current']:.4f}",
f"{row['baseline']:.4f}" if row["baseline"] is not None else "N/A",
row["delta_str"],
])

print("## Multi-Node Throughput Comparison vs. Most Recent\n")
print(tabulate(table_rows, headers=headers, tablefmt="github"))


if __name__ == "__main__":
main()