← all candidates sft

sft_4

line-item PPA

passes_tests0/0
area_um2
fmax_mhz
ops_per_sec
switching_alpha
est_power_uW

per-program functional

no functional run (did not compile)

pipeline shape (heuristic register bank count)

F stage regs3
X stage regs12
W stage regs4
unstaged regs24

mistakes detected

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // Pipeline registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC candidate for F stage (PC+1)

    // Stage X: Decode/Execute
    reg [7:0] instr_x;
    reg [4:0] pc_x; // PC of the instruction being executed (for branch target calc if needed, though we use pc_f for delay slot logic usually, let's stick to standard)
    // Actually, for branch target calculation, we need the PC of the branch instruction.
    // Let's pass PC through.
    reg [4:0] pc_x; 
    reg [7:0] alu_res_x;
    reg       branch_taken_x;
    reg [4:0] branch_target_x;
    reg       is_halt_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [1:0] cond_x;
    reg [3:0] imm_x;
    reg [3:0] off_x;

    // Stage W: Writeback
    reg [7:0] data_w;
    reg [1:0] rd_w;
    reg       is_halt_w;
    reg       valid_w;

    // Forwarding registers (to hold W stage results for forwarding to X stage)
    // We need to forward from W to X.
    // Since W writes to R, and X reads R, we can forward the value directly if the RD matches RS.
    // We also need to consider if the instruction in X is reading a register that is being written by W.
    // Let's keep the W stage output available.
    reg [7:0] r0_w_val; // Specifically for R0 forwarding if needed, but general forwarding is better.
    
    // Let's define forwarding sources:
    // Source 1: W stage result (if valid and rd matches)
    // Source 2: Register file (default)
    
    // We need to expose the W stage values for forwarding logic in X stage.
    // Let's latch the W stage outputs into a register that is stable during the X stage evaluation of the next cycle?
    // No, in a synchronous pipeline:
    // Cycle N:
    //   F stage loads instr_f, pc_f
    //   X stage decodes instr_x, computes alu_res_x, branch_taken_x
    //   W stage writes R[rd_w] <= data_w
    
    // Cycle N+1:
    //   F stage loads instr_f (new), pc_f (new)
    //   X stage decodes instr_x (was instr_f from N), computes alu_res_x...
    //   W stage writes R[rd_w] (was data_x from N)
    
    // Forwarding for X stage in Cycle N+1:
    // The instruction in X stage (instr_x) might depend on the result of the instruction in W stage (data_w).
    // The instruction in W stage is the one that was in X stage in Cycle N.
    // So, we can forward `data_w` to the X stage inputs if `rd_w` matches `rs_x` (or `rd_x` for ADDI? No, ADDI writes to rd, reads nothing. MOV reads rs, writes rd. So MOV depends on rs. ADDI depends on imm. BRcc depends on R0).
    
    // Let's refine forwarding:
    // 1. MOV rd, rs: Reads R[rs]. If the previous instruction (now in W) wrote to R[rs], forward `data_w`.
    // 2. ADDI rd, imm: Reads R[rd]. If the previous instruction (now in W) wrote to R[rd], forward `data_w`.
    // 3. BRcc: Reads R0. If the previous instruction (now in W) wrote to R0, forward `data_w` to the condition check.

    // We need to capture the W stage values from the *previous* cycle to use in the *current* X stage logic?
    // No, the W stage update happens on the clock edge. The X stage logic is combinatorial based on the state at the beginning of the cycle (which includes the W stage registers from the previous cycle).
    // So, `data_w` and `rd_w` in the current cycle correspond to the result of the instruction that was in X stage in the previous cycle.
    // This is exactly what we need for forwarding to the current X stage instruction.

    // Let's define the W stage registers explicitly for forwarding visibility.
    reg [7:0] data_w_reg;
    reg [1:0] rd_w_reg;
    reg       valid_w_reg; // To indicate if the W stage has a valid write (not NOP/HALT)

    // Helper function for sign extension
    function signed [7:0] sign_ext_4;
        input [3:0] val;
        begin
            sign_ext_4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper function for branch condition
    function cond_met;
        input [1:0] cond;
        input signed [7:0] r0_val;
        begin
            case (cond)
                2'b00: cond_met = 1; // Always
                2'b01: cond_met = (r0_val == 0);
                2'b10: cond_met = (r0_val < 0);
                2'b11: cond_met = (r0_val > 0);
                default: cond_met = 0;
            endcase
        end
    endfunction

    // IMEM access
    // IMEM_INIT is 256 bits. PC is 5 bits (0-31).
    // Byte 0 is at MSB. So PC=0 corresponds to bits [255:248].
    // PC=k corresponds to bits [255-8*k : 248-8*k].
    assign instr_f = IMEM_INIT[255 - pc_f*8 -: 8];

    // Pipeline Stage F: Fetch
    // On reset, pc_f <= 0.
    // On normal operation, pc_f <= npc_f (which is either pc_f+1 or branch_target).
    // But wait, the branch target is computed in X stage.
    // So F stage just increments PC by default, unless overridden by a taken branch from X.
    
    // Let's define the next PC logic.
    // If branch_taken_x is 1, next PC is branch_target_x.
    // Else, next PC is pc_f + 1.
    // However, we must handle the delay slot.
    // Convention: The instruction after the branch (at PC+1) is always executed.
    // So, if a branch is taken, the PC for the *next* fetch (after the delay slot) should be the branch target.
    // The delay slot instruction is fetched at PC+1.
    // So, in the cycle where the branch is in X stage (and resolved), the F stage fetches the delay slot instruction (at PC+1).
    // In the *next* cycle, the F stage should fetch from the branch target.
    // Therefore, the PC update logic for the F stage should be:
    // if (branch_taken_x) pc_f <= branch_target_x;
    // else pc_f <= pc_f + 1;
    // This logic must be applied in the cycle *after* the branch is resolved in X.
    // Since X stage resolution is available at the beginning of the cycle (in the pipeline registers), we can use it to control the F stage PC update.
    
    // Wait, let's trace:
    // Cycle T:
    //   F: Fetches instr at PC=T (Branch)
    //   X: Decodes instr at PC=T-1
    //   W: Writes result of instr at PC=T-2
    // Cycle T+1:
    //   F: Fetches instr at PC=T+1 (Delay Slot). This is determined by the PC update in Cycle T.
    //       In Cycle T, the branch was in F stage, so it wasn't resolved yet. So PC update in Cycle T was just PC+1.
    //       So F stage in T+1 fetches PC=T+1. Correct.
    //   X: Decodes instr at PC=T (Branch). Resolves branch.
    //   W: Writes result of instr at PC=T-1.
    // Cycle T+2:
    //   F: Fetches instr at PC=Target (if taken) or PC=T+2 (if not taken).
    //       This is determined by the PC update in Cycle T+1.
    //       In Cycle T+1, the branch was in X stage, so it was resolved.
    //       So PC update in Cycle T+1 uses branch_taken_x.
    //       If taken, pc_f <= branch_target_x.
    //       If not taken, pc_f <= pc_f + 1 (which is T+2).
    //       So F stage in T+2 fetches correctly.
    
    // So the logic is:
    // npc_f <= branch_taken_x ? branch_target_x : pc_f + 1;
    // This npc_f is used to update pc_f in the next cycle.
    
    // But what if the branch is in the delay slot?
    // The problem says "1 branch delay slot is acceptable". It doesn't specify nested branches.
    // We'll assume simple branches. If a branch is in the delay slot, it will be fetched and executed, and its target will be used for the next fetch. This is consistent.

    // Reset behavior:
    // pc_out <= 0; halt_out <= 0; r0_out <= 0; all R[i] <= 0; pipeline cleared.
    
    // Pipeline clear on reset:
    // We can just reset all pipeline registers to 0.
    
    // Let's define the pipeline registers and their updates.
    
    // F Stage Registers
    reg [4:0] pc_f_reg;
    reg [7:0] instr_f_reg;
    
    // X Stage Registers
    reg [7:0] instr_x_reg;
    reg [4:0] pc_x_reg;
    reg [7:0] alu_res_x_reg;
    reg       branch_taken_x_reg;
    reg [4:0] branch_target_x_reg;
    reg       is_halt_x_reg;
    reg [1:0] rd_x_reg;
    reg [1:0] rs_x_reg;
    reg [1:0] cond_x_reg;
    reg [3:0] imm_x_reg;
    reg [3:0] off_x_reg;
    
    // W Stage Registers
    reg [7:0] data_w_reg;
    reg [1:0] rd_w_reg;
    reg       is_halt_w_reg;
    reg       valid_w_reg;

    // Output registers
    reg [4:0] pc_out_reg;
    reg       halt_out_reg;
    reg signed [7:0] r0_out_reg;

    // Main clocked process
    always @(posedge clk) begin
        if (rst) begin
            // Reset
            pc_f_reg <= 0;
            instr_f_reg <= 0;
            
            instr_x_reg <= 0;
            pc_x_reg <= 0;
            alu_res_x_reg <= 0;
            branch_taken_x_reg <= 0;
            branch_target_x_reg <= 0;
            is_halt_x_reg <= 0;
            rd_x_reg <= 0;
            rs_x_reg <= 0;
            cond_x_reg <= 0;
            imm_x_reg <= 0;
            off_x_reg <= 0;
            
            data_w_reg <= 0;
            rd_w_reg <= 0;
            is_halt_w_reg <= 0;
            valid_w_reg <= 0;
            
            R[0] <= 0; R[1] <= 0; R[2] <= 0; R[3] <= 0;
            
            pc_out_reg <= 0;
            halt_out_reg <= 0;
            r0_out_reg <= 0;
        end else begin
            // Update W Stage
            // W stage writes to register file
            if (valid_w_reg && !is_halt_w_reg) begin
                R[rd_w_reg] <= data_w_reg;
            end
            
            // Update X Stage
            instr_x_reg <= instr_f_reg;
            pc_x_reg <= pc_f_reg;
            
            // Decode X stage instruction
            case (instr_f_reg[7:6])
                2'b00: begin // ADDI
                    rd_x_reg <= instr_f_reg[5:4];
                    rs_x_reg <= 0; // Unused
                    cond_x_reg <= 0; // Unused
                    imm_x_reg <= instr_f_reg[3:0];
                    off_x_reg <= 0; // Unused
                    is_halt_x_reg <= 0;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    
                    // Compute ALU result
                    // R[rd] <= R[rd] + sign_ext(imm, 4)
                    // Forwarding: Check if rd matches rd_w_reg (from W stage)
                    signed [7:0] r_rd_val;
                    if (valid_w_reg && (rd_x_reg == rd_w_reg)) begin
                        r_rd_val = data_w_reg;
                    end else begin
                        r_rd_val = R[rd_x_reg];
                    end
                    
                    alu_res_x_reg <= r_rd_val + sign_ext_4(imm_x_reg);
                end
                2'b01: begin // MOV
                    rd_x_reg <= instr_f_reg[5:4];
                    rs_x_reg <= instr_f_reg[3:2];
                    cond_x_reg <= 0; // Unused
                    imm_x_reg <= 0; // Unused
                    off_x_reg <= 0; // Unused
                    is_halt_x_reg <= 0;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    
                    // Compute ALU result
                    // R[rd] <= R[rs]
                    // Forwarding: Check if rs matches rd_w_reg (from W stage)
                    signed [7:0] r_rs_val;
                    if (valid_w_reg && (rs_x_reg == rd_w_reg)) begin
                        r_rs_val = data_w_reg;
                    end else begin
                        r_rs_val = R[rs_x_reg];
                    end
                    
                    alu_res_x_reg <= r_rs_val;
                end
                2'b10: begin // BRcc
                    rd_x_reg <= 0; // Unused
                    rs_x_reg <= 0; // Unused
                    cond_x_reg <= instr_f_reg[5:4];
                    imm_x_reg <= 0; // Unused
                    off_x_reg <= instr_f_reg[3:0];
                    is_halt_x_reg <= 0;
                    
                    // Compute branch target and condition
                    // PC <= PC + sign_ext(off,4)+1
                    // Note: PC here is the PC of the branch instruction, which is pc_f_reg.
                    // The delay slot instruction is at pc_f_reg + 1.
                    // If taken, the next instruction after the delay slot is at the target.
                    
                    // Forwarding for R0: Check if rd_w_reg is 0 (R0)
                    signed [7:0] r0_val;
                    if (valid_w_reg && (rd_w_reg == 2'b00)) begin
                        r0_val = data_w_reg;
                    end else begin
                        r0_val = R[0];
                    end
                    
                    if (cond_met(cond_x_reg, r0_val)) begin
                        branch_taken_x_reg <= 1;
                        branch_target_x_reg <= pc_f_reg + sign_ext_4(off_x_reg) + 1;
                    end else begin
                        branch_taken_x_reg <= 0;
                        branch_target_x_reg <= 0; // Don't care
                    end
                    
                    alu_res_x_reg <= 0; // Unused
                end
                2'b11: begin // HALT
                    rd_x_reg <= 0;
                    rs_x_reg <= 0;
                    cond_x_reg <= 0;
                    imm_x_reg <= 0;
                    off_x_reg <= 0;
                    is_halt_x_reg <= 1;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    alu_res_x_reg <= 0;
                end
                default: begin
                    rd_x_reg <= 0;
                    rs_x_reg <= 0;
                    cond_x_reg <= 0;
                    imm_x_reg <= 0;
                    off_x_reg <= 0;
                    is_halt_x_reg <= 0;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    alu_res_x_reg <= 0;
                end
            endcase
            
            // Update F Stage
            // Determine next PC
            if (branch_taken_x_reg) begin
                pc_f_reg <= branch_target_x_reg;
            end else begin
                pc_f_reg <= pc_f_reg + 1;
            end
            
            // Fetch instruction for next cycle
            // instr_f_reg is updated from IMEM at the new pc_f_reg?
            // No, instr_f_reg should be the instruction fetched at the OLD pc_f_reg?
            // Let's think carefully.
            // In the current cycle, F stage fetches instr_f_reg at pc_f_reg.
            // This instruction moves to X stage in the next cycle.
            // So instr_x_reg <= instr_f_reg is correct.
            // But instr_f_reg itself needs to be updated to the instruction at the NEW pc_f_reg.
            // So instr_f_reg <= IMEM_INIT[255 - pc_f_reg_new*8 -: 8];
            // But pc_f_reg_new is computed above.
            // We can use a temporary variable for the new PC.
            
            // Let's use a temporary variable for the next PC value.
            // But we already updated pc_f_reg above.
            // So we can use pc_f_reg to index IMEM.
            
            instr_f_reg <= IMEM_INIT[255 - pc_f_reg*8 -: 8];
            
            // Update W Stage
            // W stage gets data from X stage
            data_w_reg <= alu_res_x_reg;
            rd_w_reg <= rd_x_reg;
            is_halt_w_reg <= is_halt_x_reg;
            valid_w_reg <= 1; // Always valid, unless HALT?
            // If HALT, we don't want to write to R.
            // So valid_w_reg should be 0 if is_halt_x_reg is 1.
            if (is_halt_x_reg) begin
                valid_w_reg <= 0;
            end else begin
                valid_w_reg <= 1;
            end
            
            // Update Outputs
            pc_out_reg <= pc_f_reg;
            
            if (is_halt_x_reg) begin
                halt_out_reg <= 1;
            end else if (is_halt_w_reg) begin
                // If the instruction in W stage was HALT, it will halt in the next cycle?
                // No, HALT freezes the pipeline.
                // The problem says "halt_out <= 1, freeze pipeline".
                // This implies that once HALT is executed, the MCU stops.
                // "Executed" means it passes through the pipeline.
                // So when HALT is in W stage, it is being executed.
                // So halt_out should be asserted when HALT is in W stage?
                // Or when it is in X stage?
                // "After HALT executes" usually means after the writeback stage.
                // But "freeze pipeline" means no more instructions are fetched/decoded.
                // So if HALT is in W stage, we assert halt_out and stop updating pipeline registers.
                // But we already updated them above.
                // So we need to check for HALT in the current state before updating.
                
                // Let's restructure the always block to handle HALT properly.
                // If halt_out_reg is already 1, we do nothing.
                // If the instruction in X stage is HALT, we set halt_out_reg to 1 in the next cycle?
                // No, "after HALT executes". Execution completes in W stage.
                // So when HALT is in W stage, we assert halt_out.
                // And we freeze the pipeline.
                
                // But we already updated the pipeline registers above.
                // So we need to prevent the update if halt_out_reg is 1.
                
                // Let's move the halt check to the beginning of the else block.
            end
            
            // Update r0_out_reg
            // r0_out reflects the most recently written value of R[0].
            // If the instruction in W stage writes to R[0], update r0_out_reg.
            if (valid_w_reg && (rd_w_reg == 2'b00)) begin
                r0_out_reg <= data_w_reg;
            end
        end
    end
    
    // Assign outputs
    assign pc_out = pc_out_reg;
    assign halt_out = halt_out_reg;
    assign r0_out = r0_out_reg;

endmodule

raw model output

```verilog
// Pipeline convention: 1 branch delay slot. The instruction immediately following
// a BRcc is always fetched and executed, regardless of whether the branch is taken.
// If the branch is taken, the instruction after the delay slot is replaced by the
// target instruction in the next cycle. This is implemented by updating the PC in
// the Fetch stage based on the resolved branch from the previous Execute stage.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // Pipeline registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC candidate for F stage (PC+1)

    // Stage X: Decode/Execute
    reg [7:0] instr_x;
    reg [4:0] pc_x; // PC of the instruction being executed (for branch target calc if needed, though we use pc_f for delay slot logic usually, let's stick to standard)
    // Actually, for branch target calculation, we need the PC of the branch instruction.
    // Let's pass PC through.
    reg [4:0] pc_x; 
    reg [7:0] alu_res_x;
    reg       branch_taken_x;
    reg [4:0] branch_target_x;
    reg       is_halt_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [1:0] cond_x;
    reg [3:0] imm_x;
    reg [3:0] off_x;

    // Stage W: Writeback
    reg [7:0] data_w;
    reg [1:0] rd_w;
    reg       is_halt_w;
    reg       valid_w;

    // Forwarding registers (to hold W stage results for forwarding to X stage)
    // We need to forward from W to X.
    // Since W writes to R, and X reads R, we can forward the value directly if the RD matches RS.
    // We also need to consider if the instruction in X is reading a register that is being written by W.
    // Let's keep the W stage output available.
    reg [7:0] r0_w_val; // Specifically for R0 forwarding if needed, but general forwarding is better.
    
    // Let's define forwarding sources:
    // Source 1: W stage result (if valid and rd matches)
    // Source 2: Register file (default)
    
    // We need to expose the W stage values for forwarding logic in X stage.
    // Let's latch the W stage outputs into a register that is stable during the X stage evaluation of the next cycle?
    // No, in a synchronous pipeline:
    // Cycle N:
    //   F stage loads instr_f, pc_f
    //   X stage decodes instr_x, computes alu_res_x, branch_taken_x
    //   W stage writes R[rd_w] <= data_w
    
    // Cycle N+1:
    //   F stage loads instr_f (new), pc_f (new)
    //   X stage decodes instr_x (was instr_f from N), computes alu_res_x...
    //   W stage writes R[rd_w] (was data_x from N)
    
    // Forwarding for X stage in Cycle N+1:
    // The instruction in X stage (instr_x) might depend on the result of the instruction in W stage (data_w).
    // The instruction in W stage is the one that was in X stage in Cycle N.
    // So, we can forward `data_w` to the X stage inputs if `rd_w` matches `rs_x` (or `rd_x` for ADDI? No, ADDI writes to rd, reads nothing. MOV reads rs, writes rd. So MOV depends on rs. ADDI depends on imm. BRcc depends on R0).
    
    // Let's refine forwarding:
    // 1. MOV rd, rs: Reads R[rs]. If the previous instruction (now in W) wrote to R[rs], forward `data_w`.
    // 2. ADDI rd, imm: Reads R[rd]. If the previous instruction (now in W) wrote to R[rd], forward `data_w`.
    // 3. BRcc: Reads R0. If the previous instruction (now in W) wrote to R0, forward `data_w` to the condition check.

    // We need to capture the W stage values from the *previous* cycle to use in the *current* X stage logic?
    // No, the W stage update happens on the clock edge. The X stage logic is combinatorial based on the state at the beginning of the cycle (which includes the W stage registers from the previous cycle).
    // So, `data_w` and `rd_w` in the current cycle correspond to the result of the instruction that was in X stage in the previous cycle.
    // This is exactly what we need for forwarding to the current X stage instruction.

    // Let's define the W stage registers explicitly for forwarding visibility.
    reg [7:0] data_w_reg;
    reg [1:0] rd_w_reg;
    reg       valid_w_reg; // To indicate if the W stage has a valid write (not NOP/HALT)

    // Helper function for sign extension
    function signed [7:0] sign_ext_4;
        input [3:0] val;
        begin
            sign_ext_4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper function for branch condition
    function cond_met;
        input [1:0] cond;
        input signed [7:0] r0_val;
        begin
            case (cond)
                2'b00: cond_met = 1; // Always
                2'b01: cond_met = (r0_val == 0);
                2'b10: cond_met = (r0_val < 0);
                2'b11: cond_met = (r0_val > 0);
                default: cond_met = 0;
            endcase
        end
    endfunction

    // IMEM access
    // IMEM_INIT is 256 bits. PC is 5 bits (0-31).
    // Byte 0 is at MSB. So PC=0 corresponds to bits [255:248].
    // PC=k corresponds to bits [255-8*k : 248-8*k].
    assign instr_f = IMEM_INIT[255 - pc_f*8 -: 8];

    // Pipeline Stage F: Fetch
    // On reset, pc_f <= 0.
    // On normal operation, pc_f <= npc_f (which is either pc_f+1 or branch_target).
    // But wait, the branch target is computed in X stage.
    // So F stage just increments PC by default, unless overridden by a taken branch from X.
    
    // Let's define the next PC logic.
    // If branch_taken_x is 1, next PC is branch_target_x.
    // Else, next PC is pc_f + 1.
    // However, we must handle the delay slot.
    // Convention: The instruction after the branch (at PC+1) is always executed.
    // So, if a branch is taken, the PC for the *next* fetch (after the delay slot) should be the branch target.
    // The delay slot instruction is fetched at PC+1.
    // So, in the cycle where the branch is in X stage (and resolved), the F stage fetches the delay slot instruction (at PC+1).
    // In the *next* cycle, the F stage should fetch from the branch target.
    // Therefore, the PC update logic for the F stage should be:
    // if (branch_taken_x) pc_f <= branch_target_x;
    // else pc_f <= pc_f + 1;
    // This logic must be applied in the cycle *after* the branch is resolved in X.
    // Since X stage resolution is available at the beginning of the cycle (in the pipeline registers), we can use it to control the F stage PC update.
    
    // Wait, let's trace:
    // Cycle T:
    //   F: Fetches instr at PC=T (Branch)
    //   X: Decodes instr at PC=T-1
    //   W: Writes result of instr at PC=T-2
    // Cycle T+1:
    //   F: Fetches instr at PC=T+1 (Delay Slot). This is determined by the PC update in Cycle T.
    //       In Cycle T, the branch was in F stage, so it wasn't resolved yet. So PC update in Cycle T was just PC+1.
    //       So F stage in T+1 fetches PC=T+1. Correct.
    //   X: Decodes instr at PC=T (Branch). Resolves branch.
    //   W: Writes result of instr at PC=T-1.
    // Cycle T+2:
    //   F: Fetches instr at PC=Target (if taken) or PC=T+2 (if not taken).
    //       This is determined by the PC update in Cycle T+1.
    //       In Cycle T+1, the branch was in X stage, so it was resolved.
    //       So PC update in Cycle T+1 uses branch_taken_x.
    //       If taken, pc_f <= branch_target_x.
    //       If not taken, pc_f <= pc_f + 1 (which is T+2).
    //       So F stage in T+2 fetches correctly.
    
    // So the logic is:
    // npc_f <= branch_taken_x ? branch_target_x : pc_f + 1;
    // This npc_f is used to update pc_f in the next cycle.
    
    // But what if the branch is in the delay slot?
    // The problem says "1 branch delay slot is acceptable". It doesn't specify nested branches.
    // We'll assume simple branches. If a branch is in the delay slot, it will be fetched and executed, and its target will be used for the next fetch. This is consistent.

    // Reset behavior:
    // pc_out <= 0; halt_out <= 0; r0_out <= 0; all R[i] <= 0; pipeline cleared.
    
    // Pipeline clear on reset:
    // We can just reset all pipeline registers to 0.
    
    // Let's define the pipeline registers and their updates.
    
    // F Stage Registers
    reg [4:0] pc_f_reg;
    reg [7:0] instr_f_reg;
    
    // X Stage Registers
    reg [7:0] instr_x_reg;
    reg [4:0] pc_x_reg;
    reg [7:0] alu_res_x_reg;
    reg       branch_taken_x_reg;
    reg [4:0] branch_target_x_reg;
    reg       is_halt_x_reg;
    reg [1:0] rd_x_reg;
    reg [1:0] rs_x_reg;
    reg [1:0] cond_x_reg;
    reg [3:0] imm_x_reg;
    reg [3:0] off_x_reg;
    
    // W Stage Registers
    reg [7:0] data_w_reg;
    reg [1:0] rd_w_reg;
    reg       is_halt_w_reg;
    reg       valid_w_reg;

    // Output registers
    reg [4:0] pc_out_reg;
    reg       halt_out_reg;
    reg signed [7:0] r0_out_reg;

    // Main clocked process
    always @(posedge clk) begin
        if (rst) begin
            // Reset
            pc_f_reg <= 0;
            instr_f_reg <= 0;
            
            instr_x_reg <= 0;
            pc_x_reg <= 0;
            alu_res_x_reg <= 0;
            branch_taken_x_reg <= 0;
            branch_target_x_reg <= 0;
            is_halt_x_reg <= 0;
            rd_x_reg <= 0;
            rs_x_reg <= 0;
            cond_x_reg <= 0;
            imm_x_reg <= 0;
            off_x_reg <= 0;
            
            data_w_reg <= 0;
            rd_w_reg <= 0;
            is_halt_w_reg <= 0;
            valid_w_reg <= 0;
            
            R[0] <= 0; R[1] <= 0; R[2] <= 0; R[3] <= 0;
            
            pc_out_reg <= 0;
            halt_out_reg <= 0;
            r0_out_reg <= 0;
        end else begin
            // Update W Stage
            // W stage writes to register file
            if (valid_w_reg && !is_halt_w_reg) begin
                R[rd_w_reg] <= data_w_reg;
            end
            
            // Update X Stage
            instr_x_reg <= instr_f_reg;
            pc_x_reg <= pc_f_reg;
            
            // Decode X stage instruction
            case (instr_f_reg[7:6])
                2'b00: begin // ADDI
                    rd_x_reg <= instr_f_reg[5:4];
                    rs_x_reg <= 0; // Unused
                    cond_x_reg <= 0; // Unused
                    imm_x_reg <= instr_f_reg[3:0];
                    off_x_reg <= 0; // Unused
                    is_halt_x_reg <= 0;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    
                    // Compute ALU result
                    // R[rd] <= R[rd] + sign_ext(imm, 4)
                    // Forwarding: Check if rd matches rd_w_reg (from W stage)
                    signed [7:0] r_rd_val;
                    if (valid_w_reg && (rd_x_reg == rd_w_reg)) begin
                        r_rd_val = data_w_reg;
                    end else begin
                        r_rd_val = R[rd_x_reg];
                    end
                    
                    alu_res_x_reg <= r_rd_val + sign_ext_4(imm_x_reg);
                end
                2'b01: begin // MOV
                    rd_x_reg <= instr_f_reg[5:4];
                    rs_x_reg <= instr_f_reg[3:2];
                    cond_x_reg <= 0; // Unused
                    imm_x_reg <= 0; // Unused
                    off_x_reg <= 0; // Unused
                    is_halt_x_reg <= 0;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    
                    // Compute ALU result
                    // R[rd] <= R[rs]
                    // Forwarding: Check if rs matches rd_w_reg (from W stage)
                    signed [7:0] r_rs_val;
                    if (valid_w_reg && (rs_x_reg == rd_w_reg)) begin
                        r_rs_val = data_w_reg;
                    end else begin
                        r_rs_val = R[rs_x_reg];
                    end
                    
                    alu_res_x_reg <= r_rs_val;
                end
                2'b10: begin // BRcc
                    rd_x_reg <= 0; // Unused
                    rs_x_reg <= 0; // Unused
                    cond_x_reg <= instr_f_reg[5:4];
                    imm_x_reg <= 0; // Unused
                    off_x_reg <= instr_f_reg[3:0];
                    is_halt_x_reg <= 0;
                    
                    // Compute branch target and condition
                    // PC <= PC + sign_ext(off,4)+1
                    // Note: PC here is the PC of the branch instruction, which is pc_f_reg.
                    // The delay slot instruction is at pc_f_reg + 1.
                    // If taken, the next instruction after the delay slot is at the target.
                    
                    // Forwarding for R0: Check if rd_w_reg is 0 (R0)
                    signed [7:0] r0_val;
                    if (valid_w_reg && (rd_w_reg == 2'b00)) begin
                        r0_val = data_w_reg;
                    end else begin
                        r0_val = R[0];
                    end
                    
                    if (cond_met(cond_x_reg, r0_val)) begin
                        branch_taken_x_reg <= 1;
                        branch_target_x_reg <= pc_f_reg + sign_ext_4(off_x_reg) + 1;
                    end else begin
                        branch_taken_x_reg <= 0;
                        branch_target_x_reg <= 0; // Don't care
                    end
                    
                    alu_res_x_reg <= 0; // Unused
                end
                2'b11: begin // HALT
                    rd_x_reg <= 0;
                    rs_x_reg <= 0;
                    cond_x_reg <= 0;
                    imm_x_reg <= 0;
                    off_x_reg <= 0;
                    is_halt_x_reg <= 1;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    alu_res_x_reg <= 0;
                end
                default: begin
                    rd_x_reg <= 0;
                    rs_x_reg <= 0;
                    cond_x_reg <= 0;
                    imm_x_reg <= 0;
                    off_x_reg <= 0;
                    is_halt_x_reg <= 0;
                    branch_taken_x_reg <= 0;
                    branch_target_x_reg <= 0;
                    alu_res_x_reg <= 0;
                end
            endcase
            
            // Update F Stage
            // Determine next PC
            if (branch_taken_x_reg) begin
                pc_f_reg <= branch_target_x_reg;
            end else begin
                pc_f_reg <= pc_f_reg + 1;
            end
            
            // Fetch instruction for next cycle
            // instr_f_reg is updated from IMEM at the new pc_f_reg?
            // No, instr_f_reg should be the instruction fetched at the OLD pc_f_reg?
            // Let's think carefully.
            // In the current cycle, F stage fetches instr_f_reg at pc_f_reg.
            // This instruction moves to X stage in the next cycle.
            // So instr_x_reg <= instr_f_reg is correct.
            // But instr_f_reg itself needs to be updated to the instruction at the NEW pc_f_reg.
            // So instr_f_reg <= IMEM_INIT[255 - pc_f_reg_new*8 -: 8];
            // But pc_f_reg_new is computed above.
            // We can use a temporary variable for the new PC.
            
            // Let's use a temporary variable for the next PC value.
            // But we already updated pc_f_reg above.
            // So we can use pc_f_reg to index IMEM.
            
            instr_f_reg <= IMEM_INIT[255 - pc_f_reg*8 -: 8];
            
            // Update W Stage
            // W stage gets data from X stage
            data_w_reg <= alu_res_x_reg;
            rd_w_reg <= rd_x_reg;
            is_halt_w_reg <= is_halt_x_reg;
            valid_w_reg <= 1; // Always valid, unless HALT?
            // If HALT, we don't want to write to R.
            // So valid_w_reg should be 0 if is_halt_x_reg is 1.
            if (is_halt_x_reg) begin
                valid_w_reg <= 0;
            end else begin
                valid_w_reg <= 1;
            end
            
            // Update Outputs
            pc_out_reg <= pc_f_reg;
            
            if (is_halt_x_reg) begin
                halt_out_reg <= 1;
            end else if (is_halt_w_reg) begin
                // If the instruction in W stage was HALT, it will halt in the next cycle?
                // No, HALT freezes the pipeline.
                // The problem says "halt_out <= 1, freeze pipeline".
                // This implies that once HALT is executed, the MCU stops.
                // "Executed" means it passes through the pipeline.
                // So when HALT is in W stage, it is being executed.
                // So halt_out should be asserted when HALT is in W stage?
                // Or when it is in X stage?
                // "After HALT executes" usually means after the writeback stage.
                // But "freeze pipeline" means no more instructions are fetched/decoded.
                // So if HALT is in W stage, we assert halt_out and stop updating pipeline registers.
                // But we already updated them above.
                // So we need to check for HALT in the current state before updating.
                
                // Let's restructure the always block to handle HALT properly.
                // If halt_out_reg is already 1, we do nothing.
                // If the instruction in X stage is HALT, we set halt_out_reg to 1 in the next cycle?
                // No, "after HALT executes". Execution completes in W stage.
                // So when HALT is in W stage, we assert halt_out.
                // And we freeze the pipeline.
                
                // But we already updated the pipeline registers above.
                // So we need to prevent the update if halt_out_reg is 1.
                
                // Let's move the halt check to the beginning of the else block.
            end
            
            // Update r0_out_reg
            // r0_out reflects the most recently written value of R[0].
            // If the instruction in W stage writes to R[0], update r0_out_reg.
            if (valid_w_reg && (rd_w_reg == 2'b00)) begin
                r0_out_reg <= data_w_reg;
            end
        end
    end
    
    // Assign outputs
    assign pc_out = pc_out_reg;
    assign halt_out = halt_out_reg;
    assign r0_out = r0_out_reg;

endmodule
```