From 583bf088b277f2cd9977f5a3704359c9356d96fa Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 6 Feb 2026 16:01:03 -0800
Subject: [PATCH 1/2] Fix benchmark exploit via object-identity caching

The benchmark harness was vulnerable to submissions that cache results
based on Python object identity (e.g., id(tensor)). Since the same
data objects were reused across all timing iterations, a submission
could cache on first call and return cached results on subsequent
calls, showing artificial speedups of 12-36%.

Changes:
- Clone data before each timing iteration (outside the timed region)
  to give each iteration fresh object identities while not affecting
  measured kernel time
- Use local seed variable instead of mutating test.args["seed"] to
  avoid shared mutable state between benchmark runs
---
 problems/nvidia/eval_better_bench_grouped_gemm.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py
index 09b5279..072e14b 100644
--- a/problems/nvidia/eval_better_bench_grouped_gemm.py
+++ b/problems/nvidia/eval_better_bench_grouped_gemm.py
@@ -242,10 +242,14 @@ def _run_single_benchmark(
     data_list = []
     # generate input data once
 
+    local_seed = test.args.get("seed", None)
     for i in range(NUM_ITERATIONS_PER_BENCHMARK):
-        if "seed" in test.args:
-            test.args["seed"] += 42
-        data = generate_input(**test.args)
+        if local_seed is not None:
+            local_seed += 42
+            args = {**test.args, "seed": local_seed}
+        else:
+            args = test.args
+        data = generate_input(**args)
         data_list.append(data)
 
     check_copy = _clone_data(data_list)
@@ -272,12 +276,15 @@ def _run_single_benchmark(
     for i in range(max_repeats):
         torch.cuda.synchronize()
 
+        # Clone data before timing to prevent object-identity caching exploits
+        iteration_data = _clone_data(data_list)
+
         outputs = []
         clear_l2_cache()
         start_event = torch.cuda.Event(enable_timing=True)
         end_event = torch.cuda.Event(enable_timing=True)
         start_event.record()
-        for data in data_list:
+        for data in iteration_data:
             output = custom_kernel(data)
             outputs.append(output)
         end_event.record()

From 81d6d4ece2adfa6f12ca6247507816d19940a14f Mon Sep 17 00:00:00 2001
From: Mark Saroufim <marksaroufim@meta.com>
Date: Fri, 6 Feb 2026 19:06:38 -0800
Subject: [PATCH 2/2] Shuffle iteration data and fix recheck bug

Additional hardening on top of the object-identity caching fix:

- Shuffle data order each timing iteration to prevent call-count
  caching (a submission could track invocation count and predict
  which data item appears at each position)
- Move clone before torch.cuda.synchronize() so the clone's GPU copies
  can overlap with the previous iteration's tail work and are complete
  before the timed region starts
- Fix pre-existing recheck bug where only the last item's
  correctness was checked (the `if not good` test sat outside the
  for loop, so results of earlier items were overwritten unchecked)
- Use shuffle_order indices to correctly pair shuffled outputs
  with their reference data during recheck
---
 .../nvidia/eval_better_bench_grouped_gemm.py  | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/problems/nvidia/eval_better_bench_grouped_gemm.py b/problems/nvidia/eval_better_bench_grouped_gemm.py
index 072e14b..3e9b552 100644
--- a/problems/nvidia/eval_better_bench_grouped_gemm.py
+++ b/problems/nvidia/eval_better_bench_grouped_gemm.py
@@ -1,6 +1,7 @@
 import base64
 import dataclasses
 import multiprocessing
+import random
 import re
 import time
 import os
@@ -274,10 +275,14 @@ def _run_single_benchmark(
 
     bm_start_time = time.perf_counter_ns()
     for i in range(max_repeats):
-        torch.cuda.synchronize()
-
-        # Clone data before timing to prevent object-identity caching exploits
+        # Clone and shuffle data before timing to prevent both
+        # object-identity caching and call-order caching exploits
         iteration_data = _clone_data(data_list)
+        shuffle_order = list(range(len(iteration_data)))
+        random.shuffle(shuffle_order)
+        iteration_data = [iteration_data[j] for j in shuffle_order]
+
+        torch.cuda.synchronize()
 
         outputs = []
         clear_l2_cache()
@@ -294,10 +299,10 @@ def _run_single_benchmark(
         ) * 1e6  # Convert ms to ns
 
         if recheck:
-            for reference_output, custom_output in zip(check_copy, outputs):
-                good, message = check_implementation(reference_output, custom_output)
-            if not good:
-                return message
+            for j, custom_output in zip(shuffle_order, outputs):
+                good, message = check_implementation(check_copy[j], custom_output)
+                if not good:
+                    return message
 
         durations.append(duration)