diff --git a/problems/amd/eval.py b/problems/amd/eval.py
index ac3a6325..6be4c5ea 100644
--- a/problems/amd/eval.py
+++ b/problems/amd/eval.py
@@ -349,10 +349,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/amd/mla-decode/eval.py b/problems/amd/mla-decode/eval.py
index 3f67cd64..e1ef296f 100644
--- a/problems/amd/mla-decode/eval.py
+++ b/problems/amd/mla-decode/eval.py
@@ -294,10 +294,10 @@ def main():
     if mode == "test":
         return run_testing(logger, tests)
-    if mode == "benchmark":
+    if mode == "private":
         return run_benchmarking(logger, tests)
-
-    if mode == "leaderboard":
+
+    if mode == "public":
         warm_up(tests[0])
         result = benchmark(tests[-1], True, 100, 30e9)
         if isinstance(result, Stats):
diff --git a/problems/amd_distributed/eval.py b/problems/amd_distributed/eval.py
index c3d20f90..0ca65454 100644
--- a/problems/amd_distributed/eval.py
+++ b/problems/amd_distributed/eval.py
@@ -546,10 +546,10 @@ def main():
     with mp_context.Pool(n_gpus) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/bioml/trimul/eval.py b/problems/bioml/trimul/eval.py
index be957134..698e96e5 100644
--- a/problems/bioml/trimul/eval.py
+++ b/problems/bioml/trimul/eval.py
@@ -352,10 +352,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/nvidia/eval.py b/problems/nvidia/eval.py
index 252f35e4..fa2c2754 100644
--- a/problems/nvidia/eval.py
+++ b/problems/nvidia/eval.py
@@ -449,10 +449,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             run_single_benchmark(pool, tests[0], False, 1000, 5e8)
             logger.log("benchmark-count", len(tests))
             passed = True
diff --git a/problems/nvidia/eval_better_bench.py b/problems/nvidia/eval_better_bench.py
index 007781ed..70ab480a 100644
--- a/problems/nvidia/eval_better_bench.py
+++ b/problems/nvidia/eval_better_bench.py
@@ -472,10 +472,10 @@ def main():
     with mp_context.Pool(1, initializer=_init_worker) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # Warmup all test shapes to ensure consistent benchmarking
             for test in tests:
                 run_single_benchmark(pool, test, False, 1000, 5e8)
diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py
index 09b52790..653b3331 100644
--- a/problems/nvidia/eval_better_bench_grouped_gemm.py
+++ b/problems/nvidia/eval_better_bench_grouped_gemm.py
@@ -491,10 +491,10 @@ def main():
     with mp_context.Pool(1, initializer=_init_worker) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # Warmup all test shapes to ensure consistent benchmarking
             for test in tests:
                 run_single_benchmark(pool, test, False, 50, 5e8)
diff --git a/problems/nvidia/nvfp4_gemm/eval.py b/problems/nvidia/nvfp4_gemm/eval.py
index e8bb5b21..a3879b59 100644
--- a/problems/nvidia/nvfp4_gemm/eval.py
+++ b/problems/nvidia/nvfp4_gemm/eval.py
@@ -452,10 +452,10 @@ def build_test_string(tests: list[dict]):
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # Step 1: Compile kernel once (outside of timing)
             logger.log("compile", "start")
             compile_success, compile_error = pool.apply(_compile_kernel_once)
diff --git a/problems/nvidia/nvfp4_group_gemm/eval.py b/problems/nvidia/nvfp4_group_gemm/eval.py
index 2f00f53d..4290ec78 100644
--- a/problems/nvidia/nvfp4_group_gemm/eval.py
+++ b/problems/nvidia/nvfp4_group_gemm/eval.py
@@ -392,10 +392,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
diff --git a/problems/pmpp/eval.py b/problems/pmpp/eval.py
index 5815e38a..8b466a37 100644
--- a/problems/pmpp/eval.py
+++ b/problems/pmpp/eval.py
@@ -246,10 +246,10 @@ def main():
     if mode == "test":
         return run_testing(logger, tests)
-    if mode == "benchmark":
+    if mode == "private":
         return run_benchmarking(logger, tests)
-
-    if mode == "leaderboard":
+
+    if mode == "public":
         warm_up(tests[0])
         result = benchmark(tests[-1], True, 100, 30e9)
         if isinstance(result, Stats):
diff --git a/problems/pmpp_v2/eval.py b/problems/pmpp_v2/eval.py
index 981b9322..ed8d976b 100644
--- a/problems/pmpp_v2/eval.py
+++ b/problems/pmpp_v2/eval.py
@@ -343,10 +343,10 @@ def main():
     with mp_context.Pool(1) as pool:
         if mode == "test":
             return run_testing(logger, pool, tests)
-        if mode == "benchmark":
+        if mode == "private":
             return run_benchmarking(logger, pool, tests)
-        if mode == "leaderboard":
+        if mode == "public":
             # warmup
             run_single_benchmark(pool, tests[0], False, 100, 1e7)
             logger.log("benchmark-count", len(tests))
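
For reference, below is a minimal, self-contained sketch of the dispatch pattern these hunks converge on after the rename ("benchmark" -> "private", "leaderboard" -> "public"). The mode names and the helper names run_testing, run_benchmarking, and run_single_benchmark are taken from the hunk context above; everything else (the argparse wiring, the Logger class, the stub bodies, the placeholder test list, the file name mode_dispatch_sketch.py) is assumed for illustration and is not part of the patched files.

# mode_dispatch_sketch.py -- a minimal sketch of the post-rename mode dispatch.
# The helpers below are placeholder stubs, not the real evaluation code.
import argparse
import multiprocessing


class Logger:
    def log(self, key, value):
        print(f"{key}: {value}")


def run_testing(logger, pool, tests) -> int:
    logger.log("test-count", len(tests))
    return 0


def run_benchmarking(logger, pool, tests) -> int:
    logger.log("benchmark-count", len(tests))
    return 0


def run_single_benchmark(pool, test, recheck, max_repeats, max_time_ns):
    # Placeholder warmup; the real eval scripts time the submitted kernel here.
    pass


def main() -> int:
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", choices=["test", "private", "public"])
    mode = parser.parse_args().mode

    logger = Logger()
    tests = [{"name": "placeholder"}]  # stand-in for the parsed test definitions
    mp_context = multiprocessing.get_context("spawn")

    with mp_context.Pool(1) as pool:
        if mode == "test":
            return run_testing(logger, pool, tests)
        if mode == "private":      # formerly "benchmark"
            return run_benchmarking(logger, pool, tests)
        if mode == "public":       # formerly "leaderboard"
            run_single_benchmark(pool, tests[0], False, 100, 1e7)  # warmup
            logger.log("benchmark-count", len(tests))
            return run_benchmarking(logger, pool, tests)
    return 2


if __name__ == "__main__":
    raise SystemExit(main())

Running, e.g., `python mode_dispatch_sketch.py private` exercises the renamed benchmarking branch; any caller still passing "benchmark" or "leaderboard" would be rejected by the choices list, which is the behavioral effect of this patch.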