From 2c1f263756d6c323ab9a22f6a9318a0c126966b7 Mon Sep 17 00:00:00 2001 From: Nathanael Huffman Date: Wed, 6 May 2026 09:18:37 -0400 Subject: [PATCH 1/6] Add testbenches that fail for 4 possible bugs --- hdl/projects/cosmo_seq/dimms_subsystem/BUCK | 9 +- .../dimms_subsystem/sims/dimm_arb_mux_tb.vhd | 195 ++++++++++++++++++ .../dimms_subsystem/sims/spd_proxy_top_tb.vhd | 137 +++++++++++- 3 files changed, 339 insertions(+), 2 deletions(-) create mode 100644 hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/BUCK b/hdl/projects/cosmo_seq/dimms_subsystem/BUCK index f8e645ef..51b5942f 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/BUCK +++ b/hdl/projects/cosmo_seq/dimms_subsystem/BUCK @@ -56,7 +56,7 @@ vunit_sim( vunit_sim( name = "spd_proxy_top_tb", - srcs = glob(["sims/*.vhd"], exclude = ["sims/*tb_pkg.vhd"]), + srcs = glob(["sims/*.vhd"], exclude = ["sims/*tb_pkg.vhd", "sims/dimm_arb_mux_tb.vhd"]), deps = [ ":dimms_subsystem_top", ":spd_shared_sim_pkg", @@ -66,4 +66,11 @@ vunit_sim( "//hdl/ip/vhd/vunit_components:i2c_controller_vc" ], visibility = ['PUBLIC'], +) + +vunit_sim( + name = "dimm_arb_mux_tb", + srcs = ["sims/dimm_arb_mux_tb.vhd"], + deps = [":dimms_subsystem_top"], + visibility = ['PUBLIC'], ) \ No newline at end of file diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd new file mode 100644 index 00000000..ff61ace3 --- /dev/null +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd @@ -0,0 +1,195 @@ +-- This Source Code Form is subject to the terms of the Mozilla Public +-- License, v. 2.0. If a copy of the MPL was not distributed with this +-- file, You can obtain one at https://mozilla.org/MPL/2.0/. +-- +-- Copyright 2025 Oxide Computer Company + +-- Unit-level testbench for dimm_arb_mux. +-- +-- Bug 1 cannot be triggered at the integration level: the FPGA abort always finishes +-- before the CPU's first SCL falling edge (~2-3 µs vs the 5 µs STANDARD half-period), +-- so PLAY_STORED_START is entered while sample_r.state=SAMPLE_START and Bug 3 fires +-- instead. Direct stimulus control lets us reach PLAY_STORED_DATA with bit_count=1 +-- while keeping cpu_scl_in='1' through the catch-up rising edge. + +library ieee; +use ieee.std_logic_1164.all; + +library vunit_lib; + context vunit_lib.vunit_context; + +use work.i2c_common_pkg.all; +use work.tristate_if_pkg.all; + +entity dimm_arb_mux_tb is + generic (runner_cfg : string); +end entity; + +architecture tb of dimm_arb_mux_tb is + constant CLK_PER_NS : positive := 8; + constant I2C_MODE : mode_t := FAST_PLUS; + + signal clk : std_logic := '0'; + signal reset : std_logic := '1'; + + signal cpu_scl_in : std_logic := '1'; + signal cpu_scl_fedge : std_logic := '0'; + signal cpu_scl_redge : std_logic := '0'; + signal cpu_sda_in : std_logic := '1'; + signal cpu_sda_fedge : std_logic := '0'; + signal cpu_sda_redge : std_logic := '0'; + + -- dimm_scl/sda are the inputs the bus_idle_monitor watches. + -- Keep them both high so dimm_i2c_idle asserts quickly. + signal dimm_sda : std_logic := '1'; + signal dimm_scl : std_logic := '1'; + + signal bus_request : std_logic; + signal bus_grant : std_logic := '0'; + signal fpga_i2c_grant : std_logic := '0'; + signal fpga_i2c_abort_or_finish : std_logic; + + signal playback_sda_if : tristate; + signal playback_scl_if : tristate; + + signal fpga_i2c_has_bus : std_logic; + signal sp5_playback_i2c_has_bus : std_logic; + signal forced_idle_delay : std_logic; + signal sp5_i2c_has_bus : std_logic; +begin + + clk <= not clk after 4 ns; + + -- Provide idle bus feedback to the DUT (not functionally used by dimm_arb_mux + -- for these ports, but must be driven to avoid 'U' in the model). + playback_sda_if.i <= '1'; + playback_scl_if.i <= '1'; + + DUT: entity work.dimm_arb_mux + generic map ( + CLK_PER_NS => CLK_PER_NS, + I2C_MODE => I2C_MODE + ) + port map ( + clk => clk, + reset => reset, + in_a0 => '1', + cpu_scl_in => cpu_scl_in, + cpu_scl_fedge => cpu_scl_fedge, + cpu_scl_redge => cpu_scl_redge, + cpu_sda_in => cpu_sda_in, + cpu_sda_fedge => cpu_sda_fedge, + cpu_sda_redge => cpu_sda_redge, + dimm_sda => dimm_sda, + dimm_scl => dimm_scl, + bus_request => bus_request, + bus_grant => bus_grant, + fpga_i2c_grant => fpga_i2c_grant, + fpga_i2c_abort_or_finish => fpga_i2c_abort_or_finish, + playback_sda => playback_sda_if, + playback_scl => playback_scl_if, + fpga_i2c_has_bus => fpga_i2c_has_bus, + sp5_playback_i2c_has_bus => sp5_playback_i2c_has_bus, + forced_idle_delay => forced_idle_delay, + sp5_i2c_has_bus => sp5_i2c_has_bus + ); + + -- Once the intentional start condition has been generated (the first SDA change + -- while the playback SCL is high), no further SDA transition is legal while SCL + -- is still high. Any such change is a spurious start or stop on the DIMM bus. + sda_glitch_monitor: process + variable start_seen : boolean := false; + begin + loop + wait on playback_sda_if.oe; + if sp5_playback_i2c_has_bus = '1' and playback_scl_if.o = '1' then + if not start_seen then + start_seen := true; + else + check_false(true, + "Bug 1: SDA changed while playback SCL was high -- " & + "spurious start/stop condition on DIMM bus"); + end if; + end if; + end loop; + wait; + end process; + + bench: process + procedure clk_tick is begin wait until rising_edge(clk); end procedure; + begin + test_runner_setup(runner, runner_cfg); + + while test_suite loop + if run("arb_bug1_sda_glitch_during_playback") then + -- Release reset, wait for POWER_UP_CLEAR (~9 × 1000 ns at FAST_PLUS) + -- and for dimm_i2c_idle to assert (~504 ns with both dimm lines held high). + reset <= '1'; + clk_tick; + reset <= '0'; + wait for 10 us; + + -- Step 1: CPU start condition — SDA falls while SCL is high. + -- cpu_start_detected fires → mux: IDLE → CPU_REQ_GRANT. + -- sample: SAMPLE_IDLE → SAMPLE_START. + cpu_scl_in <= '1'; + cpu_sda_in <= '0'; + cpu_sda_fedge <= '1'; + clk_tick; + cpu_sda_fedge <= '0'; + clk_tick; + + -- Step 2: CPU SCL falling edge. + -- sample: SAMPLE_START → SAMPLE_DATA (bit_count still 0). + cpu_scl_in <= '0'; + cpu_scl_fedge <= '1'; + clk_tick; + cpu_scl_fedge <= '0'; + clk_tick; + + -- Step 3: CPU SCL rising edge; sample first bit (sda_in='0'). + -- sample: bit_count becomes 1, data_bits(0)='0'. + cpu_scl_in <= '1'; + cpu_scl_redge <= '1'; + clk_tick; + cpu_scl_redge <= '0'; + clk_tick; + + -- Step 4: Grant the bus. + -- sample_r.state=SAMPLE_DATA (not SAMPLE_START) → Bug 3 cannot fire + -- even though cpu_scl_in='1'. + -- mux: CPU_REQ_GRANT → PLAY_STORED_START (dimm_i2c_idle='1' already). + bus_grant <= '1'; + wait until sp5_playback_i2c_has_bus = '1'; + + -- Step 5: Hold cpu_scl_in='1' and set cpu_sda_in='1' so the catch-up + -- REDGE produces a visible SDA change. + -- + -- data_bits(0)='0' → playback currently drives SDA low (oe='1'). + -- Bug 1 will set oe := not(cpu_sda_in) = not('1') = '0' on the catch-up + -- REDGE, releasing SDA while the playback SCL is still high. + -- + -- Timeline from PLAY_STORED_START entry (playback SCL starts at '1'): + -- T + ~500 ns : first playback FEDGE → enter PLAY_STORED_DATA, + -- dimm_sda_oe set to not(data_bits(0))='1' (no change) + -- T + ~1000 ns: first playback REDGE, playback_bits=1=bit_count, + -- cpu_scl_in='1' → Bug 1 exits to ENSURE_PLAYBACK_HOLD, + -- schedules dimm_sda_oe='0' for next cycle + -- T + ~1008 ns: playback_sda_if.oe '1'→'0' while playback_scl_if.o='1' + -- → sda_glitch_monitor fires check_false → test fails + cpu_sda_in <= '1'; + wait for 2 us; + + bus_grant <= '0'; + wait for 500 ns; + end if; + end loop; + + wait for 500 ns; + test_runner_cleanup(runner); + wait; + end process; + + test_runner_watchdog(runner, 1 ms); + +end tb; diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd index 14210c82..ae3d25f1 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd @@ -32,13 +32,55 @@ entity spd_proxy_top_tb is end entity; architecture tb of spd_proxy_top_tb is - + -- Enable flag for the Bug 1 SDA glitch monitor process. + signal check_sda_glitch : std_logic := '0'; begin th: entity work.spd_proxy_top_th; + -- Bug 1 monitor: after the intentional start condition generated by the FPGA playback + -- mux, SDA must not change while the playback SCL is high. Any such change is an + -- accidental start or stop condition on the DIMM bus. + -- + -- The root cause is the PLAY_STORED_DATA → ENSURE_PLAYBACK_HOLD transition on a + -- playback rising edge: the registered SDA update lands one cycle after the rising + -- edge, i.e. while SCL is high. + sda_glitch_monitor: process + alias dimm_scl is << signal th.dimm_abcdef_scl : std_logic >>; + alias dimm_sda is << signal th.dimm_abcdef_sda : std_logic >>; + alias playback_active is + << signal th.DUT.proxy_channel_top_bus0.sp5_playback_i2c_has_bus : std_logic >>; + variable start_seen : boolean; + begin + monitor_loop: loop + wait until check_sda_glitch = '1'; + start_seen := false; + inner: loop + wait on dimm_sda, dimm_scl, check_sda_glitch, playback_active; + exit inner when check_sda_glitch = '0'; + -- Only care about SDA events while the playback mux owns the bus. + if dimm_sda'event and to_x01(dimm_scl) = '1' and playback_active = '1' then + if not start_seen then + -- First occurrence is the intended start condition (SDA falls while SCL high). + start_seen := true; + else + check_false(true, + "Bug 1: SDA changed while playback SCL was high after start condition " & + "-- spurious start/stop on DIMM bus"); + end if; + end if; + end loop inner; + end loop monitor_loop; + wait; + end process; + bench: process alias reset is << signal th.reset : std_logic >>; + -- Hierarchical aliases used by Bug 2/3/4 test cases. + alias playback_active is + << signal th.DUT.proxy_channel_top_bus0.sp5_playback_i2c_has_bus : std_logic >>; + alias cpu_has_bus_sig is + << signal th.DUT.proxy_channel_top_bus0.sp5_i2c_has_bus : std_logic >>; variable cmd : cmd_type; variable data32 : std_logic_vector(31 downto 0); variable rnd : RandomPType; @@ -284,6 +326,99 @@ begin read_bus(net, bus_handle, To_StdLogicVector(SPD_RDATA_OFFSET, bus_handle.p_address_length), data32); check_match(data32, std_logic_vector'(X"DDCCBBAA"), "Read data mismatch"); + -- ----------------------------------------------------------------------- + -- Bug regression tests for dimm_arb_mux + -- ----------------------------------------------------------------------- + + elsif run("arb_bug1_sda_glitch_during_playback") then + -- Bug 1: when PLAY_STORED_DATA transitions to ENSURE_PLAYBACK_HOLD on a + -- playback SCL *rising* edge, the registered SDA update lands one cycle + -- later while SCL is still high. The concurrent sda_glitch_monitor process + -- will call check_false the moment it observes that glitch. + write_word(memory(I2C_DIMM1F_TGT_VC), 0, X"AA"); + cmd := (op => "00", bus_addr => address(I2C_DIMM1F_TGT_VC), reg_addr => X"00", len => X"01"); + wait for 15 us; + write_bus(net, bus_handle, To_StdLogicVector(BUS0_CMD_OFFSET, bus_handle.p_address_length), pack(cmd)); + wait for 50 us; + -- Arm the monitor before triggering the CPU conflict so it catches the + -- playback window from the very first cycle. + check_sda_glitch <= '1'; + push_byte(cpu_tx_q, to_integer(std_logic_vector'(X"DE"))); + i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); + -- Allow the full conflict/playback/handoff sequence to complete. + wait for 500 us; + check_sda_glitch <= '0'; + wait for 5 us; -- let monitor react to the disable + + elsif run("arb_bug2_playback_stall") then + -- Bug 2: in PLAY_STORED_DATA, when the playback catches up to bit_count at a + -- falling playback SCL edge while cpu_scl_in='1', no exit condition fires. + -- The next redge increments playback_bits past bit_count, and no subsequent + -- condition ever matches, so the machine spins in PLAY_STORED_DATA forever. + -- Observable: cpu_has_bus_sig never asserts → timeout below fails. + -- + -- The 20 us conflict window leaves only 1-2 buffered bits, so playback + -- catches up within the first CPU SCL high phase, reliably hitting the gap. + write_word(memory(I2C_DIMM1F_TGT_VC), 0, X"AA"); + cmd := (op => "00", bus_addr => address(I2C_DIMM1F_TGT_VC), reg_addr => X"00", len => X"01"); + wait for 15 us; + write_bus(net, bus_handle, To_StdLogicVector(BUS0_CMD_OFFSET, bus_handle.p_address_length), pack(cmd)); + wait for 20 us; + push_byte(cpu_tx_q, to_integer(std_logic_vector'(X"DE"))); + i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); + -- With a correct implementation the mux hands off within ~200 us. + -- With Bug 2 the machine never exits PLAY_STORED_DATA. + wait until cpu_has_bus_sig = '1' for 800 us; + check_true(cpu_has_bus_sig = '1', + "Bug 2: mux never handed bus to CPU -- stalled in PLAY_STORED_DATA"); + + elsif run("arb_bug3_start_hold_time") then + -- Bug 3: when the CPU starts a transaction while the bus is idle, the mux + -- reaches PLAY_STORED_START while sample_r.state=SAMPLE_START and + -- cpu_scl_in='1' (CPU still in start hold-time). The immediate branch + -- goes directly to CPU_HAS_BUS, skipping ENSURE_PLAYBACK_HOLD entirely. + -- The DIMM bus sees the FPGA drive SDA low for only one system clock (~8 ns) + -- before releasing it, far below the t_HD;STA minimum of 260 ns (FAST_PLUS). + -- Observable: playback_active drops after a single cycle instead of persisting + -- through ENSURE_PLAYBACK_HOLD (~500 ns for FAST_PLUS at 125 MHz). + wait for 15 us; + push_byte(cpu_tx_q, to_integer(std_logic_vector'(X"AA"))); + i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); + -- Wait for the mux to enter the playback window (PLAY_STORED_START asserts + -- sp5_playback_i2c_has_bus = '1'). + wait until playback_active = '1' for 100 us; + check_true(playback_active = '1', "Bug 3: playback never activated"); + -- ENSURE_PLAYBACK_HOLD keeps playback_active='1' for FAST_SCL_PERIOD/2 cycles + -- = 62 * 8 ns = ~496 ns. With Bug 3 it drops in a single cycle (~8 ns). + -- Checking at 200 ns reliably distinguishes the two cases. + wait for 200 ns; + check_true(playback_active = '1', + "Bug 3: playback_active dropped in under 200 ns -- " & + "ENSURE_PLAYBACK_HOLD was bypassed, start-condition hold time violated"); + + elsif run("arb_bug4_sample_error_stale_bitcount") then + -- Bug 4: SAMPLE_ERROR transitions to SAMPLE_IDLE without resetting bit_count. + -- To reach SAMPLE_ERROR the CPU must issue 7 rising SCL edges before the bus + -- is granted. At 100 kHz that takes ~70 us; a 150 us conflict delay gives + -- the FPGA enough time to start and be mid-transaction while the CPU sends + -- all 7 address bits before the abort+idle+grant sequence completes. + -- When bit_count is stale at 7 and playback starts, it interacts with Bug 2's + -- missing fedge exit (playback_bits=7 at fedge with cpu_scl_in='1'), causing + -- a permanent stall. The cpu_has_bus_sig timeout below exposes this. + write_word(memory(I2C_DIMM1F_TGT_VC), 0, X"AA"); + cmd := (op => "10", bus_addr => address(I2C_DIMM1F_TGT_VC), reg_addr => X"00", len => X"01"); + wait for 15 us; + write_bus(net, bus_handle, To_StdLogicVector(BUS0_CMD_OFFSET, bus_handle.p_address_length), pack(cmd)); + -- 150 us ensures the CPU emits all 7 address bits (70 us) before the FPGA + -- finishes its abort + bus-idle sequence, so SAMPLE_ERROR fires first. + wait for 150 us; + push_byte(cpu_tx_q, to_integer(std_logic_vector'(X"DE"))); + i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); + wait until cpu_has_bus_sig = '1' for 800 us; + check_true(cpu_has_bus_sig = '1', + "Bug 4: stale bit_count after SAMPLE_ERROR caused mux to hang " & + "-- CPU never received bus grant"); + elsif run("spd_sm_prefetch_again") then wait for 15 us; --allow power up clear write_word(memory(I2C_DIMM1F_TGT_VC), 16#80#, X"AA"); From 0f71a84738782cb4d911b8bd754a5df577c9f7eb Mon Sep 17 00:00:00 2001 From: Nathanael Huffman Date: Wed, 6 May 2026 09:23:08 -0400 Subject: [PATCH 2/6] Fix Bug 1: remove REDGE exit from PLAY_STORED_DATA The REDGE-exit branch (cpu_scl_in='1' catch-up) set dimm_sda_oe one registered cycle after the rising edge, while the playback SCL was still high -- producing a spurious start or stop condition on the DIMM bus. The FEDGE path already handles catch-up exit correctly, so just remove the REDGE branch and let playback_bits increment cleanly. --- .../dimms_subsystem/proxy_channel/dimm_arb_mux.vhd | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd index 98f4203f..7b23e97a 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd @@ -331,18 +331,9 @@ begin -- We're going to play back one bit per synthesized scl rising edge until we catch -- up with the sampled bits. We'll to CPU_HAS_BUS only if we've caught up -- and CPU's SCL is low so that the SDA handoff is clean. - + if playback_scl_redge = '1' then v.playback_bits := mux_r.playback_bits + 1; - if v.playback_bits = sample_r.bit_count and cpu_scl_in = '1' then - v.state := ENSURE_PLAYBACK_HOLD; - v.playback_done := '1'; - v.playback_bits := 0; - v.hold_timer := 0; - -- prevent small glitches or arbitration oddities. Since we've caught up - -- and are at a scl fedge, match the CPU's sda line for a smooth transition - v.dimm_sda_oe := not cpu_sda_in; - end if; elsif playback_scl_fedge = '1' and mux_r.playback_bits < sample_r.bit_count then -- oe = 1 is output = 0 so there's an inversion here. v.dimm_sda_oe := not sample_r.data_bits(mux_r.playback_bits); From 8839a921e4cd57896c4f96018905a7e7de70e712 Mon Sep 17 00:00:00 2001 From: Nathanael Huffman Date: Wed, 6 May 2026 15:10:50 -0400 Subject: [PATCH 3/6] Fix Bug 2: PLAY_STORED_DATA FEDGE exit must not gate on cpu_scl_in --- .../proxy_channel/dimm_arb_mux.vhd | 2 +- .../dimms_subsystem/sims/dimm_arb_mux_tb.vhd | 56 +++++++++++++++++++ .../dimms_subsystem/sims/spd_proxy_top_tb.vhd | 7 ++- 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd index 7b23e97a..135ab6ad 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd @@ -337,7 +337,7 @@ begin elsif playback_scl_fedge = '1' and mux_r.playback_bits < sample_r.bit_count then -- oe = 1 is output = 0 so there's an inversion here. v.dimm_sda_oe := not sample_r.data_bits(mux_r.playback_bits); - elsif playback_scl_fedge = '1' and mux_r.playback_bits = sample_r.bit_count and cpu_scl_in = '0' then + elsif playback_scl_fedge = '1' and mux_r.playback_bits = sample_r.bit_count then -- we've played all the bits we have, just float sda until we can hand off v.state := ENSURE_PLAYBACK_HOLD; v.playback_done := '1'; diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd index ff61ace3..c0a619ea 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd @@ -182,6 +182,62 @@ begin bus_grant <= '0'; wait for 500 ns; + + elsif run("arb_bug2_playback_stall") then + -- Direct stimulus reproduction of Bug 2. + -- + -- Bug 2: in PLAY_STORED_DATA, when playback_bits catches up to bit_count + -- at a falling playback SCL edge while cpu_scl_in='1', the exit condition + -- `playback_scl_fedge='1' and playback_bits=bit_count and cpu_scl_in='0'` + -- is FALSE because cpu_scl_in is '1'. The following REDGE overshoots to + -- playback_bits=2 > bit_count=1, and no subsequent condition ever matches, + -- so the machine spins in PLAY_STORED_DATA forever. + -- + -- The setup is identical to the Bug 1 test (SAMPLE_DATA, bit_count=1) but + -- the check monitors sp5_i2c_has_bus rather than SDA. With Bug 2 fixed the + -- mux exits to ENSURE_PLAYBACK_HOLD → CPU_HAS_BUS within ~2 µs. + reset <= '1'; + clk_tick; + reset <= '0'; + wait for 10 us; + + -- CPU start condition: SAMPLE_IDLE → SAMPLE_START, mux: IDLE → CPU_REQ_GRANT. + cpu_scl_in <= '1'; + cpu_sda_in <= '0'; + cpu_sda_fedge <= '1'; + clk_tick; + cpu_sda_fedge <= '0'; + clk_tick; + + -- CPU SCL falling edge: SAMPLE_START → SAMPLE_DATA (bit_count=0). + cpu_scl_in <= '0'; + cpu_scl_fedge <= '1'; + clk_tick; + cpu_scl_fedge <= '0'; + clk_tick; + + -- CPU SCL rising edge: sample first bit ('0'), bit_count becomes 1. + cpu_scl_in <= '1'; + cpu_scl_redge <= '1'; + clk_tick; + cpu_scl_redge <= '0'; + clk_tick; + + -- Grant bus with sample_r.state=SAMPLE_DATA (not SAMPLE_START) so Bug 3 + -- cannot fire. Mux enters PLAY_STORED_START → PLAY_STORED_DATA. + bus_grant <= '1'; + wait until sp5_playback_i2c_has_bus = '1'; + + -- Keep cpu_scl_in='1' so Bug 2's exit condition (cpu_scl_in='0') is + -- blocked. With the fix removed, the mux never exits PLAY_STORED_DATA. + -- With Bug 2 fixed, the FEDGE exit fires when playback_bits=bit_count=1 + -- regardless of cpu_scl_in, and sp5_i2c_has_bus asserts within ~2 µs. + wait until sp5_i2c_has_bus = '1' for 50 us; + check_true(sp5_i2c_has_bus = '1', + "Bug 2: mux stalled in PLAY_STORED_DATA -- never reached CPU_HAS_BUS"); + + bus_grant <= '0'; + wait for 500 ns; end if; end loop; diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd index ae3d25f1..945b15d5 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd @@ -368,7 +368,12 @@ begin i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); -- With a correct implementation the mux hands off within ~200 us. -- With Bug 2 the machine never exits PLAY_STORED_DATA. - wait until cpu_has_bus_sig = '1' for 800 us; + -- Guard: NVC's wait-until requires a signal event; if cpu_has_bus_sig is + -- already '1' when i2c_write_txn returns (bus handed off during the + -- transaction), skip the wait and check the level directly. + if cpu_has_bus_sig /= '1' then + wait until cpu_has_bus_sig = '1' for 800 us; + end if; check_true(cpu_has_bus_sig = '1', "Bug 2: mux never handed bus to CPU -- stalled in PLAY_STORED_DATA"); From c29a5d918c296205e13b81173e1525c13d0a6f4b Mon Sep 17 00:00:00 2001 From: Nathanael Huffman Date: Wed, 6 May 2026 15:11:49 -0400 Subject: [PATCH 4/6] Fix Bug 3: PLAY_STORED_START must hold start condition via ENSURE_PLAYBACK_HOLD --- .../proxy_channel/dimm_arb_mux.vhd | 5 +++-- .../dimms_subsystem/sims/spd_proxy_top_tb.vhd | 15 +++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd index 135ab6ad..93bb8786 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd @@ -305,10 +305,11 @@ begin when PLAY_STORED_START => -- play back the stored start condition -- if the CPU is still in the start condition but SCL is still high - -- just hand over the bus immediately. We'll have generated start already here. + -- go through ENSURE_PLAYBACK_HOLD to satisfy I2C t_HD;STA hold time. v.dimm_sda_oe := '1'; --generate a start by pulling sda low if sample_r.state = SAMPLE_START and cpu_scl_in = '1' then - v.state := CPU_HAS_BUS; + v.state := ENSURE_PLAYBACK_HOLD; + v.hold_timer := 0; v.playback_done := '1'; elsif playback_scl_fedge = '1' then if sample_r.state = SAMPLE_DATA and sample_r.bit_count > 0 then diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd index 945b15d5..bf3c3ab5 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd @@ -86,6 +86,7 @@ begin variable rnd : RandomPType; variable cpu_tx_q : queue_t := new_queue; variable cpu_ack_q : queue_t := new_queue; + variable request_msg : msg_t; begin -- Always the first thing in the process, set up things for the VUnit test runner test_runner_setup(runner, runner_cfg); @@ -386,9 +387,13 @@ begin -- before releasing it, far below the t_HD;STA minimum of 260 ns (FAST_PLUS). -- Observable: playback_active drops after a single cycle instead of persisting -- through ENSURE_PLAYBACK_HOLD (~500 ns for FAST_PLUS at 125 MHz). + -- + -- Send the I2C START non-blocking so the bench can immediately wait for + -- the playback_active rising edge rather than blocking in i2c_write_txn + -- for ~200 us while the playback window opens and closes. wait for 15 us; - push_byte(cpu_tx_q, to_integer(std_logic_vector'(X"AA"))); - i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); + request_msg := new_msg(i2c_send_start); + send(net, I2C_CTRL_VC0.p_actor, request_msg); -- Wait for the mux to enter the playback window (PLAY_STORED_START asserts -- sp5_playback_i2c_has_bus = '1'). wait until playback_active = '1' for 100 us; @@ -400,6 +405,12 @@ begin check_true(playback_active = '1', "Bug 3: playback_active dropped in under 200 ns -- " & "ENSURE_PLAYBACK_HOLD was bypassed, start-condition hold time violated"); + -- Clean up: let the start finish, then stop the transaction. + wait_until_idle(net, I2C_CTRL_VC0.p_actor); + request_msg := new_msg(i2c_send_stop); + send(net, I2C_CTRL_VC0.p_actor, request_msg); + wait_until_idle(net, I2C_CTRL_VC0.p_actor); + wait for 100 us; elsif run("arb_bug4_sample_error_stale_bitcount") then -- Bug 4: SAMPLE_ERROR transitions to SAMPLE_IDLE without resetting bit_count. From 73e158492717ce768d5cc48a5695ea52f90f0d3c Mon Sep 17 00:00:00 2001 From: Nathanael Huffman Date: Wed, 6 May 2026 15:12:27 -0400 Subject: [PATCH 5/6] Fix Bug 4: SAMPLE_ERROR must reset bit_count before returning to SAMPLE_IDLE --- .../cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd | 1 + .../cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd index 93bb8786..05863b9c 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd @@ -232,6 +232,7 @@ begin when SAMPLE_ERROR => -- Unexpected state, go back to idle v.state := SAMPLE_IDLE; + v.bit_count := 0; end case; sample_rin <= v; diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd index bf3c3ab5..50e45e0a 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd @@ -430,7 +430,9 @@ begin wait for 150 us; push_byte(cpu_tx_q, to_integer(std_logic_vector'(X"DE"))); i2c_write_txn(net, address(I2C_DIMM1F_TGT_VC), cpu_tx_q, cpu_ack_q, I2C_CTRL_VC0.p_actor); - wait until cpu_has_bus_sig = '1' for 800 us; + if cpu_has_bus_sig /= '1' then + wait until cpu_has_bus_sig = '1' for 800 us; + end if; check_true(cpu_has_bus_sig = '1', "Bug 4: stale bit_count after SAMPLE_ERROR caused mux to hang " & "-- CPU never received bus grant"); From 637ca79a3f559895350d349e49fdd60c7fb62ad6 Mon Sep 17 00:00:00 2001 From: Nathanael Huffman Date: Wed, 6 May 2026 17:38:27 -0400 Subject: [PATCH 6/6] cleanup unused stuff --- .../dimms_subsystem/proxy_channel/dimm_arb_mux.vhd | 6 +----- .../cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd | 2 -- .../cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd | 1 + 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd index 05863b9c..050b97a2 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/proxy_channel/dimm_arb_mux.vhd @@ -66,7 +66,6 @@ entity dimm_arb_mux is fpga_i2c_has_bus : out std_logic; sp5_playback_i2c_has_bus : out std_logic; - forced_idle_delay : out std_logic; sp5_i2c_has_bus : out std_logic; ); @@ -125,13 +124,11 @@ architecture rtl of dimm_arb_mux is state : sample_state_t; data_bits : unsigned(6 downto 0); bit_count : integer range 0 to 7; - allowed_to_handoff : std_logic; end record; constant SAMPLE_REG_RESET : sample_reg_t := ( state => SAMPLE_IDLE, data_bits => (others => '0'), - bit_count => 0, - allowed_to_handoff => '0' + bit_count => 0 ); signal sample_r, sample_rin : sample_reg_t; signal dimm_i2c_idle : std_logic; @@ -160,7 +157,6 @@ begin mux_r.state = POWER_UP_CLEAR or mux_r.state = ENSURE_PLAYBACK_HOLD else '0'; sp5_i2c_has_bus <= '1' when mux_r.state = CPU_HAS_BUS else '0'; - forced_idle_delay <= '1' when mux_r.state = BUS_IDLE_DELAY else '0'; fpga_i2c_abort_or_finish <= mux_r.fpga_abort_or_finish; -- need to enforce minimum idle time on the bus before we can diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd index c0a619ea..0af52892 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/dimm_arb_mux_tb.vhd @@ -54,7 +54,6 @@ architecture tb of dimm_arb_mux_tb is signal fpga_i2c_has_bus : std_logic; signal sp5_playback_i2c_has_bus : std_logic; - signal forced_idle_delay : std_logic; signal sp5_i2c_has_bus : std_logic; begin @@ -90,7 +89,6 @@ begin playback_scl => playback_scl_if, fpga_i2c_has_bus => fpga_i2c_has_bus, sp5_playback_i2c_has_bus => sp5_playback_i2c_has_bus, - forced_idle_delay => forced_idle_delay, sp5_i2c_has_bus => sp5_i2c_has_bus ); diff --git a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd index 50e45e0a..0a971899 100644 --- a/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd +++ b/hdl/projects/cosmo_seq/dimms_subsystem/sims/spd_proxy_top_tb.vhd @@ -357,6 +357,7 @@ begin -- The next redge increments playback_bits past bit_count, and no subsequent -- condition ever matches, so the machine spins in PLAY_STORED_DATA forever. -- Observable: cpu_has_bus_sig never asserts → timeout below fails. + -- See GH # 497 -- -- The 20 us conflict window leaves only 1-2 buffered bits, so playback -- catches up within the first CPU SCL high phase, reliably hitting the gap.