From dc880eeb9a065c01d495e66a3c63b8bd4ce110d9 Mon Sep 17 00:00:00 2001
From: Mark Saroufim
Date: Sun, 1 Feb 2026 17:56:43 -0800
Subject: [PATCH] =?UTF-8?q?Rename=20submission=20modes:=20benchmark?=
 =?UTF-8?q?=E2=86=92private,=20leaderboard=E2=86=92public?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updates all eval.py files to use new mode values:
- mode == "benchmark" → mode == "private"
- mode == "leaderboard" → mode == "public"
---
 problems/amd/eval.py                              | 4 ++--
 problems/amd/mla-decode/eval.py                   | 6 +++---
 problems/amd_distributed/eval.py                  | 4 ++--
 problems/bioml/trimul/eval.py                     | 4 ++--
 problems/nvidia/eval.py                           | 4 ++--
 problems/nvidia/eval_better_bench.py              | 4 ++--
 problems/nvidia/eval_better_bench_grouped_gemm.py | 4 ++--
 problems/nvidia/nvfp4_gemm/eval.py                | 4 ++--
 problems/nvidia/nvfp4_group_gemm/eval.py          | 4 ++--
 problems/pmpp/eval.py                             | 6 +++---
 problems/pmpp_v2/eval.py                          | 4 ++--
 11 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index ac3a6325..6be4c5ea 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -349,10 +349,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/amd/mla-decode/eval.py b/problems/amd/mla-decode/eval.py
index 3f67cd64..e1ef296f 100644
--- a/problems/amd/mla-decode/eval.py
+++ b/problems/amd/mla-decode/eval.py
@@ -294,10 +294,10 @@ def main():
     if mode == "test":
         return run_testing(logger, tests)
 
-    if mode == "benchmark":
+    if mode == "private":
         return run_benchmarking(logger, tests)
-
-    if mode == "leaderboard":
+
+    if mode == "public":
         warm_up(tests[0])
         result = benchmark(tests[-1], True, 100, 30e9)
         if isinstance(result, Stats):
diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
index c3d20f90..0ca65454 100644
--- a/problems/amd_distributed/eval.py
+++ b/problems/amd_distributed/eval.py
@@ -546,10 +546,10 @@ def main():
     with mp_context.Pool(n_gpus) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
index be957134..698e96e5 100644
--- a/problems/bioml/trimul/eval.py
+++ b/problems/bioml/trimul/eval.py
@@ -352,10 +352,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py
index 252f35e4..fa2c2754 100644
--- a/problems/nvidia/eval.py
+++ b/problems/nvidia/eval.py
@@ -449,10 +449,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             run_single_benchmark(pool, tests[0], False, 1000, 5e8)
             logger.log("benchmark-count", len(tests))
             passed = True
diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 007781ed..70ab480a 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -472,10 +472,10 @@ def main():
     with mp_context.Pool(1, initializer=_init_worker) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # Warmup all test shapes to ensure consistent benchmarking
             for test in tests:
                 run_single_benchmark(pool, test, False, 1000, 5e8)
diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py
index 09b52790..653b3331 100644
--- a/problems/nvidia/eval_better_bench_grouped_gemm.py
+++ b/problems/nvidia/eval_better_bench_grouped_gemm.py
@@ -491,10 +491,10 @@ def main():
     with mp_context.Pool(1, initializer=_init_worker) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # Warmup all test shapes to ensure consistent benchmarking
             for test in tests:
                 run_single_benchmark(pool, test, False, 50, 5e8)
diff --git a/problems/nvidia/nvfp4_gemm/eval.py b/problems/nvidia/nvfp4_gemm/eval.py
index e8bb5b21..a3879b59 100644
--- a/problems/nvidia/nvfp4_gemm/eval.py
+++ b/problems/nvidia/nvfp4_gemm/eval.py
@@ -452,10 +452,10 @@ def build_test_string(tests: list[dict]):
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # Step 1: Compile kernel once (outside of timing)
             logger.log("compile", "start")
             compile_success, compile_error = pool.apply(_compile_kernel_once)
diff --git a/problems/nvidia/nvfp4_group_gemm/eval.py b/problems/nvidia/nvfp4_group_gemm/eval.py
index 2f00f53d..4290ec78 100644
--- a/problems/nvidia/nvfp4_group_gemm/eval.py
+++ b/problems/nvidia/nvfp4_group_gemm/eval.py
@@ -392,10 +392,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/pmpp/eval.py b/problems/pmpp/eval.py
index 5815e38a..8b466a37 100644
--- a/problems/pmpp/eval.py
+++ b/problems/pmpp/eval.py
@@ -246,10 +246,10 @@ def main():
     if mode == "test":
         return run_testing(logger, tests)
 
-    if mode == "benchmark":
+    if mode == "private":
         return run_benchmarking(logger, tests)
-
-    if mode == "leaderboard":
+
+    if mode == "public":
         warm_up(tests[0])
         result = benchmark(tests[-1], True, 100, 30e9)
         if isinstance(result, Stats):
diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
index 981b9322..ed8d976b 100644
--- a/problems/pmpp_v2/eval.py
+++ b/problems/pmpp_v2/eval.py
@@ -343,10 +343,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
 
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))