From 583bf088b277f2cd9977f5a3704359c9356d96fa Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 6 Feb 2026 16:01:03 -0800 Subject: [PATCH 1/2] Fix benchmark exploit via object-identity caching The benchmark harness was vulnerable to submissions that cache results based on Python object identity (e.g., id(tensor)). Since the same data objects were reused across all timing iterations, a submission could cache on first call and return cached results on subsequent calls, showing artificial speedups of 12-36%. Changes: - Clone data before each timing iteration (outside the timed region) to give each iteration fresh object identities while not affecting measured kernel time - Use local seed variable instead of mutating test.args["seed"] to avoid shared mutable state between benchmark runs --- problems/nvidia/eval_better_bench_grouped_gemm.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py index 09b5279..072e14b 100644 --- a/problems/nvidia/eval_better_bench_grouped_gemm.py +++ b/problems/nvidia/eval_better_bench_grouped_gemm.py @@ -242,10 +242,14 @@ def _run_single_benchmark( data_list = [] # generate input data once + local_seed = test.args.get("seed", None) for i in range(NUM_ITERATIONS_PER_BENCHMARK): - if "seed" in test.args: - test.args["seed"] += 42 - data = generate_input(**test.args) + if local_seed is not None: + local_seed += 42 + args = {**test.args, "seed": local_seed} + else: + args = test.args + data = generate_input(**args) data_list.append(data) check_copy = _clone_data(data_list) @@ -272,12 +276,15 @@ def _run_single_benchmark( for i in range(max_repeats): torch.cuda.synchronize() + # Clone data before timing to prevent object-identity caching exploits + iteration_data = _clone_data(data_list) + outputs = [] clear_l2_cache() start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) start_event.record() - for data in data_list: + for data in iteration_data: output = custom_kernel(data) outputs.append(output) end_event.record() From 81d6d4ece2adfa6f12ca6247507816d19940a14f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 6 Feb 2026 19:06:38 -0800 Subject: [PATCH 2/2] Shuffle iteration data and fix recheck bug Additional hardening on top of the object-identity caching fix: - Shuffle data order each timing iteration to prevent call-count caching (a submission could track invocation count and predict which data item appears at each position) - Move clone before torch.cuda.synchronize() so clone GPU copies can overlap with previous iteration's tail work - Fix pre-existing recheck bug where only the last item's correctness was checked (if not good was outside the for loop) - Use shuffle_order indices to correctly pair shuffled outputs with their reference data during recheck --- .../nvidia/eval_better_bench_grouped_gemm.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py index 072e14b..3e9b552 100644 --- a/problems/nvidia/eval_better_bench_grouped_gemm.py +++ b/problems/nvidia/eval_better_bench_grouped_gemm.py @@ -1,6 +1,7 @@ import base64 import dataclasses import multiprocessing +import random import re import time import os @@ -274,10 +275,14 @@ def _run_single_benchmark( bm_start_time = time.perf_counter_ns() for i in range(max_repeats): - torch.cuda.synchronize() - - # Clone data before timing to prevent object-identity caching exploits + # Clone and shuffle data before timing to prevent both + # object-identity caching and call-order caching exploits iteration_data = _clone_data(data_list) + shuffle_order = list(range(len(iteration_data))) + random.shuffle(shuffle_order) + iteration_data = [iteration_data[j] for j in shuffle_order] + + torch.cuda.synchronize() outputs = [] clear_l2_cache() @@ -294,10 +299,10 @@ def _run_single_benchmark( ) * 1e6 # Convert ms to ns if recheck: - for reference_output, custom_output in zip(check_copy, outputs): - good, message = check_implementation(reference_output, custom_output) - if not good: - return message + for j, custom_output in zip(shuffle_order, outputs): + good, message = check_implementation(check_copy[j], custom_output) + if not good: + return message durations.append(duration)