From 118b06a730e20e50a4a30e43881869be30b6b6a5 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 17:56:22 -0800 Subject: [PATCH 1/6] =?UTF-8?q?Rename=20submission=20modes:=20benchmark?= =?UTF-8?q?=E2=86=92private,=20leaderboard=E2=86=92public?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This renames the user-facing submission modes for clarity: - BENCHMARK → PRIVATE (run benchmarks without affecting leaderboard ranking) - LEADERBOARD → PUBLIC (official submission to the public leaderboard) Also adds SECRET mode for internal secret validation runs. Updates Discord commands: /benchmark → /private, /ranked → /public --- src/kernelbot/api/api_utils.py | 4 ++-- src/kernelbot/cogs/leaderboard_cog.py | 16 +++++++-------- src/kernelbot/cogs/verify_run_cog.py | 8 ++++---- src/libkernelbot/backend.py | 22 +++++++++++---------- src/libkernelbot/consts.py | 19 +++++++++--------- src/libkernelbot/launchers/github.py | 4 ++-- src/libkernelbot/report.py | 28 +++++++++++++++------------ src/libkernelbot/run_eval.py | 16 +++++++-------- src/libkernelbot/submission.py | 8 ++++---- tests/test_backend.py | 20 +++++++++---------- tests/test_github.py | 6 +++--- tests/test_modal.py | 10 +++++----- tests/test_task.py | 4 ++-- 13 files changed, 86 insertions(+), 79 deletions(-) diff --git a/src/kernelbot/api/api_utils.py b/src/kernelbot/api/api_utils.py index ab1505ac..1b37a8ef 100644 --- a/src/kernelbot/api/api_utils.py +++ b/src/kernelbot/api/api_utils.py @@ -213,9 +213,9 @@ async def to_submit_info( allowed_modes = [ SubmissionMode.TEST, - SubmissionMode.BENCHMARK, + SubmissionMode.PRIVATE, SubmissionMode.PROFILE, - SubmissionMode.LEADERBOARD, + SubmissionMode.PUBLIC, ] if submission_mode_enum not in allowed_modes: raise HTTPException( diff --git a/src/kernelbot/cogs/leaderboard_cog.py b/src/kernelbot/cogs/leaderboard_cog.py index 457321f3..8d00e471 100644 --- a/src/kernelbot/cogs/leaderboard_cog.py +++ b/src/kernelbot/cogs/leaderboard_cog.py @@ -64,7 +64,7 @@ async def post_submit_hook(self, interaction: discord.Interaction, sub_id: int): for run in sub_data["runs"]: if ( not run["secret"] - and run["mode"] == SubmissionMode.LEADERBOARD.value + and run["mode"] == SubmissionMode.PUBLIC.value and run["passed"] ): result_lines.append(generate_run_verdict(self.bot.backend, run, sub_data)) @@ -134,7 +134,7 @@ async def submit( reporter = MultiProgressReporterDiscord(interaction) sub_id, results = await self.bot.backend.submit_full(req, mode, reporter) - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.PUBLIC: await self.post_submit_hook(interaction, sub_id) return sub_id @@ -157,7 +157,7 @@ async def submit_test( interaction, leaderboard_name, script, mode=SubmissionMode.TEST, gpu=gpu ) - @app_commands.command(name="benchmark", description="Start a benchmarking run") + @app_commands.command(name="private", description="Start a private benchmarking run") @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", script="The Python / CUDA script file to run", @@ -165,7 +165,7 @@ async def submit_test( ) @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling - async def submit_bench( + async def submit_private( self, interaction: discord.Interaction, script: discord.Attachment, @@ -173,7 +173,7 @@ async def submit_bench( gpu: Optional[str], ): return await self.submit( - interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu + interaction, 
leaderboard_name, script, mode=SubmissionMode.PRIVATE, gpu=gpu ) @app_commands.command(name="profile", description="Start a profiling run") @@ -196,7 +196,7 @@ async def submit_profile( ) @app_commands.command( - name="ranked", description="Start a ranked run for an official leaderboard submission" + name="public", description="Start a public run for an official leaderboard submission" ) @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", @@ -205,7 +205,7 @@ async def submit_profile( ) @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling - async def submit_ranked( + async def submit_public( self, interaction: discord.Interaction, script: discord.Attachment, @@ -213,7 +213,7 @@ async def submit_ranked( gpu: Optional[str] = None, ): return await self.submit( - interaction, leaderboard_name, script, mode=SubmissionMode.LEADERBOARD, gpu=gpu + interaction, leaderboard_name, script, mode=SubmissionMode.PUBLIC, gpu=gpu ) diff --git a/src/kernelbot/cogs/verify_run_cog.py b/src/kernelbot/cogs/verify_run_cog.py index 53102682..58ad5844 100644 --- a/src/kernelbot/cogs/verify_run_cog.py +++ b/src/kernelbot/cogs/verify_run_cog.py @@ -171,8 +171,8 @@ async def verify_modal_run( @app_commands.choices( mode=[ Choice(name=SubmissionMode.TEST.name, value=SubmissionMode.TEST.value), - Choice(name=SubmissionMode.BENCHMARK.name, value=SubmissionMode.BENCHMARK.value), - Choice(name=SubmissionMode.LEADERBOARD.name, value=SubmissionMode.LEADERBOARD.value), + Choice(name=SubmissionMode.PRIVATE.name, value=SubmissionMode.PRIVATE.value), + Choice(name=SubmissionMode.PUBLIC.name, value=SubmissionMode.PUBLIC.value), Choice(name="All", value="all"), ] ) @@ -194,9 +194,9 @@ async def verify_task( modes = [] if mode is None: - modes = [SubmissionMode.LEADERBOARD] + modes = [SubmissionMode.PUBLIC] elif mode.value == "all": - modes = [SubmissionMode.TEST, SubmissionMode.BENCHMARK, SubmissionMode.LEADERBOARD] + modes = [SubmissionMode.TEST, SubmissionMode.PRIVATE, SubmissionMode.PUBLIC] else: modes = [SubmissionMode(mode.value)] diff --git a/src/libkernelbot/backend.py b/src/libkernelbot/backend.py index f3b68bb0..2f90e0e3 100644 --- a/src/libkernelbot/backend.py +++ b/src/libkernelbot/backend.py @@ -86,7 +86,7 @@ async def submit_full( for gpu in selected_gpus ] - if mode == SubmissionMode.LEADERBOARD: + if mode == SubmissionMode.PUBLIC: tasks += [ self.submit_leaderboard( sub_id, @@ -95,7 +95,7 @@ async def submit_full( gpu, reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), req.task, - SubmissionMode.PRIVATE, + SubmissionMode.SECRET, req.secret_seed, ) for gpu in selected_gpus @@ -142,12 +142,14 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None + # Check for the mode's result key (public or secret) + mode_key = mode.value if ( - "leaderboard" in result.runs - and result.runs["leaderboard"].run.success - and result.runs["leaderboard"].run.passed + mode_key in result.runs + and result.runs[mode_key].run.success + and result.runs[mode_key].run.passed ): - score = compute_score(result, task, submission_id) + score = compute_score(result, task, submission_id, mode_key) # verifyruns uses a fake submission id of -1 if submission_id != -1: @@ -159,8 +161,8 @@ async def submit_leaderboard( # noqa: C901 end=value.end, mode=key, runner=gpu_type.name, - score=None if key != "leaderboard" else score, - secret=mode == SubmissionMode.PRIVATE, + score=None if key != mode_key else score, + secret=mode == 
SubmissionMode.SECRET, compilation=value.compilation, result=value.run, system=result.system, @@ -207,7 +209,7 @@ async def handle_submission( await reporter.update_title(reporter.title + " ✅ success") short_report = make_short_report( - result.runs, full=mode in [SubmissionMode.PRIVATE, SubmissionMode.LEADERBOARD] + result.runs, full=mode in [SubmissionMode.PUBLIC, SubmissionMode.SECRET] ) stream_msg = ( @@ -222,7 +224,7 @@ async def handle_submission( ) await reporter.push(short_report) - if mode != SubmissionMode.PRIVATE: + if mode != SubmissionMode.SECRET: try: # does the last message of the short report start with ✅ or ❌? verdict = short_report[-1][0] diff --git a/src/libkernelbot/consts.py b/src/libkernelbot/consts.py index f60764de..ac667cce 100644 --- a/src/libkernelbot/consts.py +++ b/src/libkernelbot/consts.py @@ -82,21 +82,22 @@ class SubmissionMode(Enum): """ Different types of submission that can be made: Test: Run tests and give detailed results about passed/failed tests. These have short timeouts. - Benchmark: Run larger benchmarks. Each benchmark is tested once, and then run multiple times. + Private: Run benchmarks privately. Each benchmark is tested once, and then run multiple times. + Returns detailed timing results but doesn't affect leaderboard ranking. Profile: Gather profiling information. One selected benchmark is run under the profiler. No testing is performed in this mode (sometimes, you need to profile deliberately broken code) - Leaderboard: Official submission to the leaderboard. This first runs public tests, then a - repeated invocation of a single benchmark. Feedback for the secret benchmark is only very - limited (no stdout/stderr). - Private: Special run that does test followed by leaderboard (on a secret seed), but gives only - very limited feedback. + Public: Official submission to the leaderboard. This first runs public tests, then a + repeated invocation of a single benchmark. If all tests pass, the submission is evaluated + and ranked on the public leaderboard. + Secret: Internal mode for running the full evaluation flow with a secret seed. This is used + for secret validation runs that accompany public submissions. 
""" TEST = "test" - BENCHMARK = "benchmark" - PROFILE = "profile" - LEADERBOARD = "leaderboard" PRIVATE = "private" + PROFILE = "profile" + PUBLIC = "public" + SECRET = "secret" class Language(Enum): diff --git a/src/libkernelbot/launchers/github.py b/src/libkernelbot/launchers/github.py index a1970a7e..c984c749 100644 --- a/src/libkernelbot/launchers/github.py +++ b/src/libkernelbot/launchers/github.py @@ -49,8 +49,8 @@ def get_timeout(config: dict) -> int: mode = config.get("mode") sec_map = { SubmissionMode.TEST.value: config.get("test_timeout"), - SubmissionMode.BENCHMARK.value: config.get("benchmark_timeout"), - SubmissionMode.LEADERBOARD.value: config.get("ranked_timeout"), + SubmissionMode.PRIVATE.value: config.get("benchmark_timeout"), + SubmissionMode.PUBLIC.value: config.get("ranked_timeout"), } seconds = sec_map.get(mode) or DEFAULT_GITHUB_TIMEOUT_MINUTES * 60 return math.ceil(seconds / 60) diff --git a/src/libkernelbot/report.py b/src/libkernelbot/report.py index 70f91487..b0f8baf5 100644 --- a/src/libkernelbot/report.py +++ b/src/libkernelbot/report.py @@ -176,8 +176,8 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n elif full: result.append("❌ Tests missing") - if "benchmark" in runs: - bench_run = runs["benchmark"].run + if "private" in runs: + bench_run = runs["private"].run if not bench_run.success: result.append("❌ Running benchmarks failed" + _short_fail_reason(bench_run)) return result @@ -202,16 +202,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n else: result.append("✅ Profiling successful") - if "leaderboard" in runs: - lb_run = runs["leaderboard"].run + # Check for public or secret run results + ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None) + if ranked_key: + lb_run = runs[ranked_key].run if not lb_run.success: - result.append("❌ Running leaderboard failed" + _short_fail_reason(lb_run)) + result.append("❌ Running ranked submission failed" + _short_fail_reason(lb_run)) elif not lb_run.passed: - result.append("❌ Leaderboard run failed") + result.append("❌ Ranked submission failed") else: - result.append("✅ Leaderboard run successful") + result.append("✅ Ranked submission successful") elif full: - result.append("❌ Leaderboard missing") + result.append("❌ Ranked submission missing") return result @@ -339,8 +341,8 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport num_tests = int(test_run.result.get("test-count", 0)) report.add_log(f"✅ Passed {num_tests}/{num_tests} tests", make_test_log(test_run)) - if "benchmark" in runs: - bench_run = runs["benchmark"] + if "private" in runs: + bench_run = runs["private"] if _handle_crash_report(report, bench_run): return report @@ -378,8 +380,10 @@ def generate_report(result: FullResult, extra_text: str = "") -> RunResultReport base64.b64decode(prof_run.profile.trace), ) - if "leaderboard" in runs: - bench_run = runs["leaderboard"] + # Check for public or secret run results + ranked_key = "public" if "public" in runs else ("secret" if "secret" in runs else None) + if ranked_key: + bench_run = runs[ranked_key] if _handle_crash_report(report, bench_run): return report diff --git a/src/libkernelbot/run_eval.py b/src/libkernelbot/run_eval.py index aec59f95..5891f302 100644 --- a/src/libkernelbot/run_eval.py +++ b/src/libkernelbot/run_eval.py @@ -556,8 +556,8 @@ def run_single_evaluation( if mode == "test": timeout = test_timeout cases.write(tests) - elif mode in ["benchmark", "profile", 
"leaderboard"]: - timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout + elif mode in ["private", "profile", "public", "secret"]: + timeout = ranked_timeout if mode in ["public", "secret"] else benchmark_timeout if ranking_by == "last": cases.write(benchmarks.splitlines(keepends=True)[-1]) else: @@ -801,22 +801,22 @@ def run_evaluation( common_args["benchmarks"] = benchmark results[f"{mode}.{i}"] = call(mode=mode, **common_args) - elif mode in ["test", "benchmark"]: + elif mode in ["test", "private"]: results[mode] = call(mode=mode, **common_args) - elif mode in ["private", "leaderboard"]: + elif mode in ["public", "secret"]: # first, run the tests results["test"] = call(mode="test", **common_args) if not results["test"].run or not results["test"].run.passed: return results - results["benchmark"] = call(mode="benchmark", **common_args) + results["private"] = call(mode="private", **common_args) - if not results["benchmark"].run or not results["benchmark"].run.passed: + if not results["private"].run or not results["private"].run.passed: return results - # if they pass, run the leaderboard validation - results["leaderboard"] = call(mode="leaderboard", **common_args) + # if they pass, run the public/secret validation + results[mode] = call(mode=mode, **common_args) else: raise AssertionError("Invalid mode") diff --git a/src/libkernelbot/submission.py b/src/libkernelbot/submission.py index 805f7435..12cbc661 100644 --- a/src/libkernelbot/submission.py +++ b/src/libkernelbot/submission.py @@ -169,8 +169,8 @@ def _get_popcorn_directives(submission: str) -> dict: # noqa: C901 return popcorn_info -def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) -> float: - num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"]) +def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int, mode_key: str = "public") -> float: + num_benchmarks = int(result.runs[mode_key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: if num_benchmarks != 1: logger.error( @@ -182,11 +182,11 @@ def compute_score(result: FullResult, task: LeaderboardTask, submission_id: int) raise KernelBotError( f"Expected submission to have exactly one benchmark, got {num_benchmarks}." 
) - score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9 + score = float(result.runs[mode_key].run.result["benchmark.0.mean"]) / 1e9 else: scores = [] for i in range(num_benchmarks): - scores.append(float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) / 1e9) + scores.append(float(result.runs[mode_key].run.result[f"benchmark.{i}.mean"]) / 1e9) if task.ranking_by == RankCriterion.MEAN: score = sum(scores) / len(scores) elif task.ranking_by == RankCriterion.GEOM: diff --git a/tests/test_backend.py b/tests/test_backend.py index f69170c5..af327519 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -55,7 +55,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "pass", "submit.py", task, - consts.SubmissionMode.LEADERBOARD, + consts.SubmissionMode.PUBLIC, -1, ) @@ -64,7 +64,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "> ✅ Compilation successful", "> ✅ Testing successful", "> ❌ Benchmarks missing", - "> ❌ Leaderboard missing", + "> ❌ Ranked submission missing", ] call_args = reporter.display_report.call_args[0] @@ -130,7 +130,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): submit_time, ) eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result}) + mock_launcher = _mock_launcher(bot, {"secret": eval_result}) reporter = MockProgressReporter("report") @@ -141,7 +141,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): consts.ModalGPU.A100, reporter, task, - consts.SubmissionMode.LEADERBOARD, + consts.SubmissionMode.SECRET, seed=1337, ) @@ -155,7 +155,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "benchmarks": [{"dtype": "float32", "input_size": 10000}], "lang": "py", "main": "kernel.py", - "mode": "leaderboard", + "mode": "secret", "multi_gpu": False, "ranked_timeout": 180, "ranking_by": "geom", @@ -193,7 +193,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "secret", "passed": True, "result": { "benchmark-count": "1", @@ -206,7 +206,7 @@ async def test_submit_leaderboard(bot: backend.KernelBackend, task_directory): }, "runner": "A100", "score": Decimal("1.5e-9"), - "secret": False, + "secret": True, "start_time": eval_result.start.replace(tzinfo=datetime.timezone.utc), "system": { "cpu": "Intel i9-12900K", @@ -249,7 +249,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): ) reporter = MockMultReporter() s_id, results = await bot.submit_full( - req, mode=consts.SubmissionMode.LEADERBOARD, reporter=reporter + req, mode=consts.SubmissionMode.PUBLIC, reporter=reporter ) expected_result = mock_launcher.run_submission.return_value @@ -261,13 +261,13 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "> ✅ Compilation successful", "> ❌ Tests missing", "> ❌ Benchmarks missing", - "> ✅ Leaderboard run successful", + "> ✅ Ranked submission successful", ] assert r2.lines == [ "> ✅ Compilation successful", "> ❌ Tests missing", "> ❌ Benchmarks missing", - "> ✅ Leaderboard run successful", + "> ✅ Ranked submission successful", ] assert r1.title == "A100 on Modal ✅ success" assert r2.title == "A100 on Modal (secret) ✅ success" diff --git a/tests/test_github.py b/tests/test_github.py index 413e00bd..8b8a4d63 100644 --- a/tests/test_github.py +++ b/tests/test_github.py @@ 
-179,7 +179,7 @@ async def test_github_launcher_failing_script(project_root: Path, github_config: task=task_definition.task, submission_content=submission_content, arch=0, - mode=SubmissionMode.LEADERBOARD, + mode=SubmissionMode.PUBLIC, ) result = await launcher.run_submission(config, gpu_type, reporter) @@ -190,9 +190,9 @@ async def test_github_launcher_failing_script(project_root: Path, github_config: # But the actual test or benchmark should fail test_passed = result.runs.get("test", {}).run.passed if "test" in result.runs else True - benchmark_passed = result.runs.get("benchmark", {}).run.passed if "benchmark" in result.runs else True + private_passed = result.runs.get("private", {}).run.passed if "private" in result.runs else True - assert not (test_passed and benchmark_passed), "Expected at least one run to fail for cheating script" + assert not (test_passed and private_passed), "Expected at least one run to fail for cheating script" diff --git a/tests/test_modal.py b/tests/test_modal.py index d22ef05b..c87bd2c7 100644 --- a/tests/test_modal.py +++ b/tests/test_modal.py @@ -265,7 +265,7 @@ async def test_modal_multi_gpu_benchmark( task=task_definition.task, submission_content=submission_content, arch=GPU_TO_SM[ModalGPU.L4x4.name], - mode=SubmissionMode.BENCHMARK, + mode=SubmissionMode.PRIVATE, ) result = await launcher.run_submission(config, ModalGPU.L4x4, reporter) @@ -280,8 +280,8 @@ async def test_modal_multi_gpu_benchmark( assert result.system.device_count == 4 # Test run structure - assert "benchmark" in result.runs - bench_run = result.runs["benchmark"] + assert "private" in result.runs + bench_run = result.runs["private"] # For Python runs, compilation is None assert bench_run.compilation is None @@ -317,7 +317,7 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat task=task_definition.task, submission_content=submission_content, arch=GPU_TO_SM[gpu_type.name], - mode=SubmissionMode.LEADERBOARD, + mode=SubmissionMode.PUBLIC, ) result = await launcher.run_submission(config, gpu_type, reporter) @@ -325,4 +325,4 @@ async def test_modal_launcher_failing_script(modal_deployment, project_root: Pat # Basic structure and success assert result.success, f"Expected successful run, got: {result.error}" assert result.error == "" - assert result.runs["test"].run.passed is False or result.runs["benchmark"].run.passed is False + assert result.runs["test"].run.passed is False or result.runs["private"].run.passed is False diff --git a/tests/test_task.py b/tests/test_task.py index 809a6907..b6fa6a63 100644 --- a/tests/test_task.py +++ b/tests/test_task.py @@ -126,7 +126,7 @@ def test_build_task_config_python(leaderboard_task): """Test build_task_config with Python task and submission content.""" submission_content = "print('Hello World')" arch = "sm_80" - mode = SubmissionMode.BENCHMARK + mode = SubmissionMode.PRIVATE result = build_task_config( task=leaderboard_task, submission_content=submission_content, arch=arch, mode=mode @@ -164,7 +164,7 @@ def test_build_task_config_cuda(): """Test build_task_config with CUDA task and submission content.""" submission_content = "print('Hello World')" arch = "sm_80" - mode = SubmissionMode.BENCHMARK + mode = SubmissionMode.PRIVATE task = LeaderboardTask( lang=Language.CUDA, files={"test.cu": "code", "submission.cu": "@SUBMISSION@", "test.cuh": "header"}, From ee62fdc798c4b6ba8a28d854a7ff3a7bed31f4b9 Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:12:10 -0800 Subject: [PATCH 2/6] Fix test files to use new 
private/public mode naming Update test data keys and expected values: - test_report.py: Change "benchmark"/"leaderboard" keys to "private"/"public" - test_submission.py: Update compute_score test to use "public" key - test_backend.py: Update mode values and mock data keys --- tests/test_backend.py | 8 ++++---- tests/test_report.py | 30 +++++++++++++++--------------- tests/test_submission.py | 8 ++++---- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index af327519..04d0fde4 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -101,7 +101,7 @@ async def test_handle_submission(bot: backend.KernelBackend, task_directory): "benchmarks": [{"dtype": "float32", "input_size": 10000}], "lang": "py", "main": "kernel.py", - "mode": "leaderboard", + "mode": "public", "multi_gpu": False, "ranked_timeout": 180, "ranking_by": "geom", @@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"leaderboard": eval_result}) + mock_launcher = _mock_launcher(bot, {"public": eval_result}) from libkernelbot.submission import ProcessedSubmissionRequest @@ -300,7 +300,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "public", "passed": True, "result": { "benchmark-count": "1", @@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "leaderboard", + "mode": "public", "passed": True, "result": { "benchmark-count": "1", diff --git a/tests/test_report.py b/tests/test_report.py index ae3afd25..f8a61b29 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -241,7 +241,7 @@ def test_make_short_report_benchmarking_failed(sample_eval_result: EvalResult): sample_eval_result.run.success = False sample_eval_result.compilation = None sample_eval_result.run.exit_code = consts.ExitCode.CUDA_FAIL - runs = {"benchmark": sample_eval_result} + runs = {"private": sample_eval_result} result = make_short_report(runs, full=False) assert result == ["❌ Running benchmarks failed (cuda api error)"] @@ -274,27 +274,27 @@ def test_make_short_report_leaderboard_failed(sample_eval_result: EvalResult): sample_eval_result.run.success = False sample_eval_result.compilation = None sample_eval_result.run.exit_code = consts.ExitCode.TEST_SPEC - runs = {"leaderboard": sample_eval_result} + runs = {"public": sample_eval_result} result = make_short_report(runs, full=False) - assert result == ["❌ Running leaderboard failed (internal error 113)"] + assert result == ["❌ Running ranked submission failed (internal error 113)"] sample_eval_result.run.success = True sample_eval_result.run.passed = False sample_eval_result.run.exit_code = consts.ExitCode.VALIDATE_FAIL result = make_short_report(runs) # TODO is this actually possible? Should profiling do **any** correctness testing? 
- assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard run failed"] + assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission failed"] def test_make_short_report_empty(): result = make_short_report({}) - assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Leaderboard missing"] + assert result == ["❌ Tests missing", "❌ Benchmarks missing", "❌ Ranked submission missing"] def test_make_short_report_full_success(): runs = {} - for run_type in ["test", "benchmark", "profile", "leaderboard"]: + for run_type in ["test", "private", "profile", "public"]: runs[run_type] = EvalResult( start=datetime.datetime.now() - datetime.timedelta(minutes=5), end=datetime.datetime.now(), @@ -318,7 +318,7 @@ def test_make_short_report_full_success(): "✅ Testing successful", "✅ Benchmarking successful", "✅ Profiling successful", - "✅ Leaderboard run successful", + "✅ Ranked submission successful", ] assert result == expected @@ -331,7 +331,7 @@ def test_make_short_report_missing_components(): "✅ Compilation successful", "✅ Testing successful", "❌ Benchmarks missing", - "❌ Leaderboard missing", + "❌ Ranked submission missing", ] assert result == expected @@ -532,7 +532,7 @@ def test_generate_report_test_failure(sample_full_result: FullResult): def test_generate_report_benchmark_failure(sample_full_result: FullResult): from libkernelbot.report import Log, Text - sample_full_result.runs["benchmark"] = create_eval_result() + sample_full_result.runs["private"] = create_eval_result() report = generate_report(sample_full_result) assert report.data == [ Text( @@ -557,8 +557,8 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): Log(header="Benchmarks", content="❗ Could not find any benchmarks"), ] - sample_full_result.runs["benchmark"].run.passed = False - sample_full_result.runs["benchmark"].run.result = { + sample_full_result.runs["private"].run.passed = False + sample_full_result.runs["private"].run.result = { "benchmark-count": "2", "benchmark.0.status": "pass", "benchmark.0.spec": "Basic functionality", @@ -607,7 +607,7 @@ def test_generate_report_benchmark_failure(sample_full_result: FullResult): def test_generate_report_leaderboard_failure(sample_full_result: FullResult): from libkernelbot.report import Log, Text - sample_full_result.runs["leaderboard"] = create_eval_result() + sample_full_result.runs["public"] = create_eval_result() report = generate_report(sample_full_result) assert report.data == [ Text( @@ -632,9 +632,9 @@ def test_generate_report_leaderboard_failure(sample_full_result: FullResult): Log(header="Ranked Benchmark", content="❗ Could not find any benchmarks"), ] - sample_full_result.runs["leaderboard"].run.success = False - sample_full_result.runs["leaderboard"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED - sample_full_result.runs["leaderboard"].run.duration = 10.0 + sample_full_result.runs["public"].run.success = False + sample_full_result.runs["public"].run.exit_code = consts.ExitCode.TIMEOUT_EXPIRED + sample_full_result.runs["public"].run.duration = 10.0 report = generate_report(sample_full_result) assert report.data == [ diff --git a/tests/test_submission.py b/tests/test_submission.py index e22fcb8e..1654b9b1 100644 --- a/tests/test_submission.py +++ b/tests/test_submission.py @@ -303,7 +303,7 @@ def test_compute_score(): # Test LAST ranking with single benchmark mock_task.ranking_by = RankCriterion.LAST mock_result.runs = { - "leaderboard": mock.Mock( + "public": mock.Mock( run=mock.Mock( result={ 
"benchmark-count": "1", @@ -317,7 +317,7 @@ def test_compute_score(): # Test MEAN ranking with multiple benchmarks mock_task.ranking_by = RankCriterion.MEAN - mock_result.runs["leaderboard"].run.result = { + mock_result.runs["public"].run.result = { "benchmark-count": "2", "benchmark.0.mean": "1000000000", # 1 second "benchmark.1.mean": "3000000000", # 3 seconds @@ -327,7 +327,7 @@ def test_compute_score(): # Test GEOM ranking with multiple benchmarks mock_task.ranking_by = RankCriterion.GEOM - mock_result.runs["leaderboard"].run.result = { + mock_result.runs["public"].run.result = { "benchmark-count": "2", "benchmark.0.mean": "4000000000", # 4 seconds "benchmark.1.mean": "9000000000", # 9 seconds @@ -337,7 +337,7 @@ def test_compute_score(): # Test LAST with multiple benchmarks (should raise error) mock_task.ranking_by = RankCriterion.LAST - mock_result.runs["leaderboard"].run.result["benchmark-count"] = "2" + mock_result.runs["public"].run.result["benchmark-count"] = "2" with pytest.raises(KernelBotError, match="exactly one benchmark"): submission.compute_score(mock_result, mock_task, 1) From 6fcc19f17e4ecfe8a618524047f5f0476d7cd56c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:15:57 -0800 Subject: [PATCH 3/6] Fix test_submit_full mock and expected mode - Add 'secret' key to mock launcher runs so SECRET mode can find its result - Fix second run's expected mode from 'public' to 'secret' --- tests/test_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index 04d0fde4..ea420dbd 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -232,7 +232,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"public": eval_result}) + mock_launcher = _mock_launcher(bot, {"public": eval_result, "secret": eval_result}) from libkernelbot.submission import ProcessedSubmissionRequest @@ -344,7 +344,7 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): "stdout": "log stdout", "success": True, }, - "mode": "public", + "mode": "secret", "passed": True, "result": { "benchmark-count": "1", From 2e022cadeb6ca39c994ad00fa9c619458431f20e Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:19:04 -0800 Subject: [PATCH 4/6] Fix GitHub integration test to use PR branch Set GITHUB_BRANCH env var to use the PR's source branch instead of falling back to main. Uses github.head_ref for PRs, github.ref_name for direct pushes. 
--- .github/workflows/testing.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index c34818f2..667b2c4c 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -58,6 +58,7 @@ jobs: if: github.actor != 'dependabot[bot]' env: GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + GITHUB_BRANCH: ${{ github.head_ref || github.ref_name }} steps: - uses: actions/checkout@v4 - uses: astral-sh/setup-uv@v4 From d47bad7ee9c02b9261cb42d0d1a014b736ce044c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Sun, 1 Feb 2026 18:21:53 -0800 Subject: [PATCH 5/6] Fix test_submit_full mock to return mode-specific results Use side_effect to return different FullResult for each call: - First call (PUBLIC mode) returns {"public": eval_result} - Second call (SECRET mode) returns {"secret": eval_result} This prevents the backend from storing all keys from both calls. --- tests/test_backend.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/test_backend.py b/tests/test_backend.py index ea420dbd..a07b1514 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -232,7 +232,18 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): task = db.get_leaderboard("submit-leaderboard")["task"] eval_result = create_eval_result("benchmark") - mock_launcher = _mock_launcher(bot, {"public": eval_result, "secret": eval_result}) + # Use side_effect to return different results for each call + # First call (PUBLIC mode) returns {"public": ...}, second call (SECRET mode) returns {"secret": ...} + mock_launcher = MagicMock(spec=backend.Launcher) + mock_launcher.name = "launcher" + mock_launcher.gpus = [consts.ModalGPU.A100] + mock_launcher.run_submission = AsyncMock( + side_effect=[ + FullResult(success=True, error="", system=sample_system_info(), runs={"public": eval_result}), + FullResult(success=True, error="", system=sample_system_info(), runs={"secret": eval_result}), + ] + ) + bot.register_launcher(mock_launcher) from libkernelbot.submission import ProcessedSubmissionRequest @@ -252,9 +263,15 @@ async def test_submit_full(bot: backend.KernelBackend, task_directory): req, mode=consts.SubmissionMode.PUBLIC, reporter=reporter ) - expected_result = mock_launcher.run_submission.return_value + expected_result_public = FullResult( + success=True, error="", system=sample_system_info(), runs={"public": eval_result} + ) + expected_result_secret = FullResult( + success=True, error="", system=sample_system_info(), runs={"secret": eval_result} + ) assert len(results) == 2 - assert results == [expected_result, expected_result] + assert results[0].success == expected_result_public.success + assert results[1].success == expected_result_secret.success r1, r2 = reporter.reporter_list assert r1.lines == [ From a002f08dfad0ffe236bbdc485667103f50c5a74f Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Fri, 6 Feb 2026 08:35:44 -0800 Subject: [PATCH 6/6] Fix PRIVATE mode submissions incorrectly receiving scores PRIVATE mode runs return timing info but should not affect leaderboard ranking. This change restricts score computation to only PUBLIC and SECRET modes, ensuring PRIVATE submissions have score=None as intended. 
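In sketch form (illustrative only; is_ranked_mode is a hypothetical helper, the
actual change lives inline in submit_leaderboard), the gate amounts to:

    from libkernelbot.consts import SubmissionMode

    def is_ranked_mode(mode: SubmissionMode) -> bool:
        # Only PUBLIC and SECRET runs may receive a leaderboard score;
        # TEST, PRIVATE, and PROFILE runs always keep score=None.
        return mode in (SubmissionMode.PUBLIC, SubmissionMode.SECRET)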
--- src/libkernelbot/backend.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/libkernelbot/backend.py b/src/libkernelbot/backend.py index 2f90e0e3..2c442dff 100644 --- a/src/libkernelbot/backend.py +++ b/src/libkernelbot/backend.py @@ -142,14 +142,16 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None - # Check for the mode's result key (public or secret) mode_key = mode.value - if ( - mode_key in result.runs - and result.runs[mode_key].run.success - and result.runs[mode_key].run.passed - ): - score = compute_score(result, task, submission_id, mode_key) + # Only compute scores for ranked modes (PUBLIC and SECRET), not PRIVATE + # PRIVATE runs return timing info but don't affect leaderboard ranking + if mode in (SubmissionMode.PUBLIC, SubmissionMode.SECRET): + if ( + mode_key in result.runs + and result.runs[mode_key].run.success + and result.runs[mode_key].run.passed + ): + score = compute_score(result, task, submission_id, mode_key) # verifyruns uses a fake submission id of -1 if submission_id != -1: