← all candidates base

base_5

line-item PPA

passes_tests: 0/0
area_um2
fmax_mhz
ops_per_sec
switching_alpha
est_power_uW

per-program functional

no functional run (did not compile)

pipeline shape (heuristic register bank count)

F stage regs: 3
X stage regs: 13
W stage regs: 8
unstaged regs: 5

mistakes detected

extracted RTL

```verilog
// Branch convention: The instruction immediately following a BRcc is always executed (1 delay slot).
// If the branch is taken, the instruction in the delay slot is executed, and the next instruction
// fetched is the branch target. This is standard for simple 3-stage pipelines without prediction.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Registers
    // Stage F (Fetch)
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC calculated in F stage

    // Stage X (Execute)
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg [4:0] npc_x;
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [3:0] off_x;
    reg [1:0] cond_x;
    reg signed [7:0] alu_result_x;
    reg branch_taken_x;
    reg is_halt_x;
    reg [7:0] rs_val_x;

    // Stage W (Writeback)
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg [1:0] opcode_w;
    reg [1:0] rd_w;
    reg signed [7:0] alu_result_w;
    reg is_halt_w;
    reg branch_taken_w;
    reg [4:0] npc_w; // The PC that should be committed if branch was taken

    // Forwarding logic signals
    reg signed [7:0] rs_fwd;
    reg signed [7:0] rd_fwd; // Not strictly needed for ADDI/MOV as rd is destination, but good for completeness if we had 2 src regs. Here only MOV has rs.
    reg [1:0] fwd_sel; // 0: RF, 1: EX/MEM (X stage result), 2: MEM/WB (W stage result) - simplified to just X and W for 3 stage

    // Internal wires for forwarding decisions
    wire [1:0] fwd_src; 

    // ----------------------------------------------------------------
    // Reset and Pipeline Control
    // ----------------------------------------------------------------
    reg pipeline_flush;
    reg halt_internal;

    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 0;
            instr_f <= 0;
            npc_f <= 0;

            pc_x <= 0;
            instr_x <= 0;
            npc_x <= 0;
            opcode_x <= 0;
            rd_x <= 0;
            rs_x <= 0;
            imm_x <= 0;
            off_x <= 0;
            cond_x <= 0;
            alu_result_x <= 0;
            branch_taken_x <= 0;
            is_halt_x <= 0;
            rs_val_x <= 0;

            pc_w <= 0;
            instr_w <= 0;
            opcode_w <= 0;
            rd_w <= 0;
            alu_result_w <= 0;
            is_halt_w <= 0;
            branch_taken_w <= 0;
            npc_w <= 0;

            for (int i = 0; i < 4; i++) rf[i] <= 0;

            pc_out <= 0;
            halt_out <= 0;
            r0_out <= 0;
            halt_internal <= 0;
            pipeline_flush <= 0;
        end else if (halt_internal) begin
            // Freeze pipeline
            halt_out <= 1;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            halt_out <= 0; // Reset if not halted
            
            // Update R0 output
            if (opcode_w == 2'b00 || opcode_w == 2'b01) begin // ADDI or MOV
                if (rd_w == 0) begin
                    r0_out <= alu_result_w;
                end
            end else if (opcode_w == 2'b11) begin // HALT
                // r0_out remains unchanged
            end else begin
                // r0_out remains unchanged
            end

            // ----------------------------------------------------------------
            // Stage W: Writeback
            // ----------------------------------------------------------------
            if (pipeline_flush) begin
                // Flush W stage? Usually branch resolution happens in X, so W is just delayed.
                // If branch is taken in X, the instruction in W is the delay slot instruction.
                // It should execute. So we don't flush W on branch taken.
                // We only flush if we have a structural hazard or similar, but problem says no stalls.
                // Actually, if we detect a hazard that requires a stall, we would flush. 
                // But problem says "No stalls allowed for ALU->ALU". 
                // What about Load-Use? We don't have LOAD. Only ADDI/MOV/BR/HALT.
                // So no stalls needed.
                pc_w <= pc_w;
                instr_w <= instr_w;
                opcode_w <= opcode_w;
                rd_w <= rd_w;
                alu_result_w <= alu_result_w;
                is_halt_w <= is_halt_w;
                branch_taken_w <= branch_taken_w;
                npc_w <= npc_w;
            end else begin
                pc_w <= pc_x;
                instr_w <= instr_x;
                opcode_w <= opcode_x;
                rd_w <= rd_x;
                alu_result_w <= alu_result_x;
                is_halt_w <= is_halt_x;
                branch_taken_w <= branch_taken_x;
                npc_w <= npc_x;
            end

            // Execute W stage logic
            if (!pipeline_flush) begin
                if (is_halt_w) begin
                    halt_internal <= 1;
                end else if (opcode_w == 2'b00 || opcode_w == 2'b01) begin
                    // Write result to RF
                    if (rd_w < 4) begin
                        rf[rd_w] <= alu_result_w;
                    end
                end
                // BRcc and HALT do not write to RF
            end

            // ----------------------------------------------------------------
            // Stage X: Decode/Execute
            // ----------------------------------------------------------------
            if (pipeline_flush) begin
                // If we flush X, we need to decide what to put in.
                // Standard bubble: put NOP-like values.
                pc_x <= 0;
                instr_x <= 0;
                npc_x <= 0;
                opcode_x <= 0;
                rd_x <= 0;
                rs_x <= 0;
                imm_x <= 0;
                off_x <= 0;
                cond_x <= 0;
                alu_result_x <= 0;
                branch_taken_x <= 0;
                is_halt_x <= 0;
                rs_val_x <= 0;
            end else begin
                // Move from F to X
                pc_x <= pc_f;
                instr_x <= instr_f;
                npc_x <= npc_f;

                // Decode
                opcode_x <= instr_f[7:6];
                if (instr_f[7:6] == 2'b00) begin // ADDI
                    rd_x <= instr_f[5:4];
                    imm_x <= instr_f[3:0];
                    rs_x <= 0; // Unused
                    off_x <= 0;
                    cond_x <= 0;
                end else if (instr_f[7:6] == 2'b01) begin // MOV
                    rd_x <= instr_f[5:4];
                    rs_x <= instr_f[3:2];
                    imm_x <= 0;
                    off_x <= 0;
                    cond_x <= 0;
                end else if (instr_f[7:6] == 2'b10) begin // BRcc
                    cond_x <= instr_f[5:4];
                    off_x <= instr_f[3:0];
                    rd_x <= 0;
                    rs_x <= 0;
                    imm_x <= 0;
                end else begin // HALT
                    rd_x <= 0;
                    rs_x <= 0;
                    imm_x <= 0;
                    off_x <= 0;
                    cond_x <= 0;
                end

                // Execute
                is_halt_x <= (instr_f[7:6] == 2'b11);
                branch_taken_x <= 0;

                // Forwarding for RS in X stage
                // We need to determine the value of RS for MOV
                // Sources: RF, X-stage result (if it was just computed), W-stage result (if it was just committed)
                // Since we are in X stage, the "previous" instruction is in W stage.
                // The "current" instruction is in X stage.
                // Forwarding logic:
                // If the instruction in W stage writes to the register that X stage reads, use W result.
                // If the instruction in X stage writes to the register that X stage reads? No, X stage is computing.
                // Wait, forwarding is for the *current* X instruction's operands.
                // The operands for X instruction come from RF, unless a previous instruction (in W or X) is writing to that register.
                // Since we have a 3-stage pipeline:
                // Time T: Inst I in W, Inst I+1 in X, Inst I+2 in F
                // Inst I+1 in X needs RS.
                // If Inst I (in W) writes to RS, forward from W.
                // If Inst I+1 (in X) is an ADDI/MOV, it produces a result.
                // Does Inst I+1 read a register that it itself writes? No.
                // Does Inst I+1 read a register that Inst I+2 (in F) will read? That's for the next cycle.
                // So for X stage, we only forward from W stage.
                
                // Let's refine:
                // Current X instruction: opcode_x, rs_x
                // Previous W instruction: opcode_w, rd_w, alu_result_w
                // If opcode_w is ADDI or MOV, and rd_w == rs_x, and rd_w != 0 (R0 is hardwired 0? No, R0 is a register).
                // Note: R0 is a register. If rd_w is 0, we write to R0.
                // If rs_x is 0, we read R0.
                // So if rd_w == rs_x, we forward.
                
                // Also, what if the instruction in the *next* X stage (current F) needs forwarding?
                // That will be handled in the next cycle when F moves to X.
                // So here, we just compute rs_val_x for the current X instruction.
                
                // Check W stage for forwarding
                if (opcode_w == 2'b00 || opcode_w == 2'b01) begin
                    if (rd_w == rs_x) begin
                        rs_val_x <= alu_result_w;
                    end else begin
                        rs_val_x <= rf[rs_x];
                    end
                end else begin
                    rs_val_x <= rf[rs_x];
                end

                case (opcode_x)
                    2'b00: begin // ADDI
                        // R[rd] <= R[rd] + sign_ext(imm, 4)
                        // Note: ADDI reads R[rd] as source? "R[rd] <= R[rd] + ..."
                        // So we need the value of R[rd].
                        // Forwarding for RD in ADDI?
                        // ADDI uses RD as both source and destination.
                        // So we need to forward the value of RD if a previous instruction wrote to it.
                        // The previous instruction is in W stage.
                        // So we need to forward to the "source" of ADDI, which is RD.
                        
                        // Let's create a generic source value for the ALU.
                        // For ADDI, source is R[rd].
                        // For MOV, source is R[rs].
                        
                        // Re-evaluating forwarding for ADDI:
                        // ADDI: R[rd] <= R[rd] + imm
                        // We need R[rd].
                        // Check W stage: if opcode_w is ADDI/MOV and rd_w == rd_x, forward alu_result_w.
                        // Else check RF.
                        
                        signed [7:0] src_val;
                        if (opcode_w == 2'b00 || opcode_w == 2'b01) begin
                            if (rd_w == rd_x) begin
                                src_val <= alu_result_w;
                            end else begin
                                src_val <= rf[rd_x];
                            end
                        end else begin
                            src_val <= rf[rd_x];
                        end
                        
                        alu_result_x <= src_val + $signed({{4{imm_x[3]}}, imm_x});
                    end
                    2'b01: begin // MOV
                        // R[rd] <= R[rs]
                        // We already computed rs_val_x with forwarding from W.
                        alu_result_x <= rs_val_x;
                    end
                    2'b10: begin // BRcc
                        // Evaluate condition
                        // cond 00=always, 01=zero(R0), 10=neg(R0), 11=pos(R0)
                        // We need the current value of R0.
                        // R0 might have been updated by the instruction in W stage.
                        // So we need to forward R0 from W stage if W stage wrote to R0.
                        
                        signed [7:0] r0_val;
                        if (opcode_w == 2'b00 || opcode_w == 2'b01) begin
                            if (rd_w == 0) begin
                                r0_val <= alu_result_w;
                            end else begin
                                r0_val <= rf[0];
                            end
                        end else begin
                            r0_val <= rf[0];
                        end
                        
                        case (cond_x)
                            2'b00: branch_taken_x <= 1;
                            2'b01: branch_taken_x <= (r0_val == 0);
                            2'b10: branch_taken_x <= (r0_val < 0);
                            2'b11: branch_taken_x <= (r0_val > 0);
                            default: branch_taken_x <= 0;
                        endcase
                        
                        // Compute branch target
                        // PC <= PC + sign_ext(off,4) + 1
                        // PC here is pc_x (the PC of the branch instruction)
                        alu_result_x <= pc_x + $signed({{4{off_x[3]}}, off_x}) + 1;
                    end
                    2'b11: begin // HALT
                        alu_result_x <= 0;
                    end
                    default: begin
                        alu_result_x <= 0;
                    end
                endcase
            end

            // ----------------------------------------------------------------
            // Stage F: Fetch
            // ----------------------------------------------------------------
            if (pipeline_flush) begin
                // If we flush F, we bubble it.
                pc_f <= pc_f; // Or increment? Usually bubble means NOP.
                instr_f <= 0;
                npc_f <= pc_f + 1; // Default next PC
            end else begin
                // Determine next PC for the *next* fetch
                // The instruction currently in X stage might be a branch.
                // If branch is taken, the next PC (for the instruction after the delay slot) is the branch target.
                // The instruction in W stage is the delay slot instruction. It executes.
                // The instruction in F stage is the one after the delay slot.
                // So if X stage branch is taken, F stage should fetch from branch target.
                // If X stage branch is not taken, F stage should fetch from npc_x (which is pc_x + 1).
                
                // Wait, npc_x is computed in X stage as the branch target.
                // If branch is not taken, the next PC is pc_x + 1.
                // We stored npc_x in X stage. For non-branch instructions, npc_x is just pc_x + 1?
                // In X stage decode, we set npc_x <= npc_f.
                // For BRcc, we computed alu_result_x as the target.
                // So if branch_taken_x, the next PC is alu_result_x.
                // Else, the next PC is pc_x + 1.
                
                // But wait, npc_f was computed in the previous cycle for the instruction now in X.
                // For the instruction now in F, we need to compute its next PC.
                // The instruction in F is the one after the delay slot.
                // If the branch in X was taken, the instruction in F is the first instruction at the target.
                // So npc_f for the *next* cycle should be pc_f + 1.
                // And pc_f for the *next* cycle should be the branch target.
                
                // Let's trace:
                // Cycle T:
                // W: I-1
                // X: I (Branch)
                // F: I+1 (Delay Slot)
                
                // Cycle T+1:
                // W: I (Branch) -> Committed. If taken, PC becomes target.
                // X: I+1 (Delay Slot) -> Executed.
                // F: I+2 -> Fetched from Target+1 if branch taken, or I+1+1 if not.
                
                // So in Cycle T, when we update F for Cycle T+1:
                // pc_f_next = branch_taken_x ? alu_result_x : pc_x + 1;
                // instr_f_next = IMEM_INIT[pc_f_next * 8 +: 8];
                // npc_f_next = pc_f_next + 1;
                
                // What if X is not a branch?
                // Then pc_f_next = pc_x + 1;
                // Which is the same as npc_x (if npc_x was set to pc_x+1 in X stage).
                
                // So:
                if (opcode_x == 2'b10) begin
                    if (branch_taken_x) begin
                        pc_f <= alu_result_x;
                    end else begin
                        pc_f <= pc_x + 1;
                    end
                end else begin
                    pc_f <= pc_x + 1;
                end
                
                // Fetch instruction
                if (pc_f < 32) begin
                    instr_f <= IMEM_INIT[pc_f * 8 +: 8];
                end else begin
                    instr_f <= 0; // Out of bounds
                end
                
                // Compute npc_f for the instruction now in F (for the next cycle's X stage)
                npc_f <= pc_f + 1;
            end
        end
    end

    // Pipeline Flush Logic
    // When does a flush happen?
    // In this simple pipeline with forwarding and no stalls, we don't have data hazards that require stalls.
    // We have branch hazards.
    // The branch decision is made in X stage.
    // The instruction in F stage is the delay slot instruction.
    // If the branch is taken, the instruction in F stage is the delay slot instruction.
    // It should execute.
    // The instruction *after* the delay slot (which will be in F stage in the next cycle) should be the target.
    // We handled this by updating pc_f directly based on branch_taken_x.
    // So we don't need to flush the F stage in the sense of bubbling it.
    // We just redirect the fetch.
    // What about the W stage?
    // The instruction in W stage is I-1.
    // The instruction in X stage is I (Branch).
    // The instruction in F stage is I+1 (Delay Slot).
    // If branch is taken, I+1 executes.
    // I (Branch) commits in W stage next cycle.
    // So no flush is needed for W or X.
    // The only "flush" is the redirection of F.
    // So pipeline_flush is always 0?
    // Wait, what if we have a HALT in X stage?
    // If X stage is HALT, it will set halt_internal in W stage next cycle.
    // The instruction in F stage (delay slot) will execute.
    // Then the pipeline halts.
    // This is correct.
    
    // What if we have a branch in X stage, and the branch target is the same as the next sequential PC?
    // Then it's effectively not taken.
    // Our logic handles this.
    
    // So pipeline_flush is not needed for data hazards.
    // Is it needed for control hazards?
    // We resolved control hazards by redirecting F.
    // So pipeline_flush <= 0;
    
    // However, if we wanted to implement a stall, we would set pipeline_flush.
    // But the problem says no stalls for ALU->ALU.
    // And we don't have Load-Use hazards.
    // So pipeline_flush is always 0.
    
    // Let's double check the forwarding logic.
    // ADDI: R[rd] <= R[rd] + imm
    // MOV: R[rd] <= R[rs]
    // BRcc: uses R0.
    
    // Forwarding for ADDI:
    // If the previous instruction (in W) wrote to R[rd], we use that value.
    // This is correct.
    
    // Forwarding for MOV:
    // If the previous instruction (in W) wrote to R[rs], we use that value.
    // This is correct.
    
    // Forwarding for BRcc:
    // If the previous instruction (in W) wrote to R0, we use that value.
    // This is correct.
    
    // What if the instruction in X stage is an ADDI that writes to R0, and

raw model output

```verilog
// Branch convention: The instruction immediately following a BRcc is always executed (1 delay slot).
// If the branch is taken, the instruction in the delay slot is executed, and the next instruction
// fetched is the branch target. This is standard for simple 3-stage pipelines without prediction.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Registers
    // Stage F (Fetch)
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC calculated in F stage

    // Stage X (Execute)
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg [4:0] npc_x;
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [3:0] off_x;
    reg [1:0] cond_x;
    reg signed [7:0] alu_result_x;
    reg branch_taken_x;
    reg is_halt_x;
    reg [7:0] rs_val_x;

    // Stage W (Writeback)
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg [1:0] opcode_w;
    reg [1:0] rd_w;
    reg signed [7:0] alu_result_w;
    reg is_halt_w;
    reg branch_taken_w;
    reg [4:0] npc_w; // The PC that should be committed if branch was taken

    // Forwarding logic signals
    reg signed [7:0] rs_fwd;
    reg signed [7:0] rd_fwd; // Not strictly needed for ADDI/MOV as rd is destination, but good for completeness if we had 2 src regs. Here only MOV has rs.
    reg [1:0] fwd_sel; // 0: RF, 1: EX/MEM (X stage result), 2: MEM/WB (W stage result) - simplified to just X and W for 3 stage

    // Internal wires for forwarding decisions
    wire [1:0] fwd_src; 

    // ----------------------------------------------------------------
    // Reset and Pipeline Control
    // ----------------------------------------------------------------
    reg pipeline_flush;
    reg halt_internal;

    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 0;
            instr_f <= 0;
            npc_f <= 0;

            pc_x <= 0;
            instr_x <= 0;
            npc_x <= 0;
            opcode_x <= 0;
            rd_x <= 0;
            rs_x <= 0;
            imm_x <= 0;
            off_x <= 0;
            cond_x <= 0;
            alu_result_x <= 0;
            branch_taken_x <= 0;
            is_halt_x <= 0;
            rs_val_x <= 0;

            pc_w <= 0;
            instr_w <= 0;
            opcode_w <= 0;
            rd_w <= 0;
            alu_result_w <= 0;
            is_halt_w <= 0;
            branch_taken_w <= 0;
            npc_w <= 0;

            for (int i = 0; i < 4; i++) rf[i] <= 0;

            pc_out <= 0;
            halt_out <= 0;
            r0_out <= 0;
            halt_internal <= 0;
            pipeline_flush <= 0;
        end else if (halt_internal) begin
            // Freeze pipeline
            halt_out <= 1;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            halt_out <= 0; // Reset if not halted
            
            // Update R0 output
            if (opcode_w == 2'b00 || opcode_w == 2'b01) begin // ADDI or MOV
                if (rd_w == 0) begin
                    r0_out <= alu_result_w;
                end
            end else if (opcode_w == 2'b11) begin // HALT
                // r0_out remains unchanged
            end else begin
                // r0_out remains unchanged
            end

            // ----------------------------------------------------------------
            // Stage W: Writeback
            // ----------------------------------------------------------------
            if (pipeline_flush) begin
                // Flush W stage? Usually branch resolution happens in X, so W is just delayed.
                // If branch is taken in X, the instruction in W is the delay slot instruction.
                // It should execute. So we don't flush W on branch taken.
                // We only flush if we have a structural hazard or similar, but problem says no stalls.
                // Actually, if we detect a hazard that requires a stall, we would flush. 
                // But problem says "No stalls allowed for ALU->ALU". 
                // What about Load-Use? We don't have LOAD. Only ADDI/MOV/BR/HALT.
                // So no stalls needed.
                pc_w <= pc_w;
                instr_w <= instr_w;
                opcode_w <= opcode_w;
                rd_w <= rd_w;
                alu_result_w <= alu_result_w;
                is_halt_w <= is_halt_w;
                branch_taken_w <= branch_taken_w;
                npc_w <= npc_w;
            end else begin
                pc_w <= pc_x;
                instr_w <= instr_x;
                opcode_w <= opcode_x;
                rd_w <= rd_x;
                alu_result_w <= alu_result_x;
                is_halt_w <= is_halt_x;
                branch_taken_w <= branch_taken_x;
                npc_w <= npc_x;
            end

            // Execute W stage logic
            if (!pipeline_flush) begin
                if (is_halt_w) begin
                    halt_internal <= 1;
                end else if (opcode_w == 2'b00 || opcode_w == 2'b01) begin
                    // Write result to RF
                    if (rd_w < 4) begin
                        rf[rd_w] <= alu_result_w;
                    end
                end
                // BRcc and HALT do not write to RF
            end

            // ----------------------------------------------------------------
            // Stage X: Decode/Execute
            // ----------------------------------------------------------------
            if (pipeline_flush) begin
                // If we flush X, we need to decide what to put in.
                // Standard bubble: put NOP-like values.
                pc_x <= 0;
                instr_x <= 0;
                npc_x <= 0;
                opcode_x <= 0;
                rd_x <= 0;
                rs_x <= 0;
                imm_x <= 0;
                off_x <= 0;
                cond_x <= 0;
                alu_result_x <= 0;
                branch_taken_x <= 0;
                is_halt_x <= 0;
                rs_val_x <= 0;
            end else begin
                // Move from F to X
                pc_x <= pc_f;
                instr_x <= instr_f;
                npc_x <= npc_f;

                // Decode
                opcode_x <= instr_f[7:6];
                if (instr_f[7:6] == 2'b00) begin // ADDI
                    rd_x <= instr_f[5:4];
                    imm_x <= instr_f[3:0];
                    rs_x <= 0; // Unused
                    off_x <= 0;
                    cond_x <= 0;
                end else if (instr_f[7:6] == 2'b01) begin // MOV
                    rd_x <= instr_f[5:4];
                    rs_x <= instr_f[3:2];
                    imm_x <= 0;
                    off_x <= 0;
                    cond_x <= 0;
                end else if (instr_f[7:6] == 2'b10) begin // BRcc
                    cond_x <= instr_f[5:4];
                    off_x <= instr_f[3:0];
                    rd_x <= 0;
                    rs_x <= 0;
                    imm_x <= 0;
                end else begin // HALT
                    rd_x <= 0;
                    rs_x <= 0;
                    imm_x <= 0;
                    off_x <= 0;
                    cond_x <= 0;
                end

                // Execute
                is_halt_x <= (instr_f[7:6] == 2'b11);
                branch_taken_x <= 0;

                // Forwarding for RS in X stage
                // We need to determine the value of RS for MOV
                // Sources: RF, X-stage result (if it was just computed), W-stage result (if it was just committed)
                // Since we are in X stage, the "previous" instruction is in W stage.
                // The "current" instruction is in X stage.
                // Forwarding logic:
                // If the instruction in W stage writes to the register that X stage reads, use W result.
                // If the instruction in X stage writes to the register that X stage reads? No, X stage is computing.
                // Wait, forwarding is for the *current* X instruction's operands.
                // The operands for X instruction come from RF, unless a previous instruction (in W or X) is writing to that register.
                // Since we have a 3-stage pipeline:
                // Time T: Inst I in W, Inst I+1 in X, Inst I+2 in F
                // Inst I+1 in X needs RS.
                // If Inst I (in W) writes to RS, forward from W.
                // If Inst I+1 (in X) is an ADDI/MOV, it produces a result.
                // Does Inst I+1 read a register that it itself writes? No.
                // Does Inst I+1 read a register that Inst I+2 (in F) will read? That's for the next cycle.
                // So for X stage, we only forward from W stage.
                
                // Let's refine:
                // Current X instruction: opcode_x, rs_x
                // Previous W instruction: opcode_w, rd_w, alu_result_w
                // If opcode_w is ADDI or MOV, and rd_w == rs_x, and rd_w != 0 (R0 is hardwired 0? No, R0 is a register).
                // Note: R0 is a register. If rd_w is 0, we write to R0.
                // If rs_x is 0, we read R0.
                // So if rd_w == rs_x, we forward.
                
                // Also, what if the instruction in the *next* X stage (current F) needs forwarding?
                // That will be handled in the next cycle when F moves to X.
                // So here, we just compute rs_val_x for the current X instruction.
                
                // Check W stage for forwarding
                // ------------------------------------------------------------
                // X-stage execute with W->X forwarding.
                //
                // The forwarded operands are computed with blocking assignments
                // into block-local temporaries so that they are usable in the
                // same clock edge. Assigning them with "<=" and reading them
                // straight away (as an earlier revision did) would read the
                // value from the previous edge instead. The temporaries are
                // declared in a named block because Verilog only allows local
                // declarations inside named begin/end blocks.
                //
                // Registers written here: rs_val_x, alu_result_x and (for BRcc
                // only) branch_taken_x. Everything else is read-only.
                // ------------------------------------------------------------
                begin : x_stage_execute
                    reg              w_writes_reg; // W holds ADDI or MOV, the two opcodes treated as register writers
                    reg signed [7:0] rs_operand;   // R[rs_x] after forwarding
                    reg signed [7:0] rd_operand;   // R[rd_x] after forwarding
                    reg signed [7:0] r0_operand;   // R[0]    after forwarding

                    // Forward alu_result_w whenever W targets the register being
                    // read; otherwise fall back to the register file.
                    w_writes_reg = (opcode_w == 2'b00) || (opcode_w == 2'b01);
                    rs_operand   = (w_writes_reg && (rd_w == rs_x)) ? alu_result_w : rf[rs_x];
                    rd_operand   = (w_writes_reg && (rd_w == rd_x)) ? alu_result_w : rf[rd_x];
                    r0_operand   = (w_writes_reg && (rd_w == 2'd0)) ? alu_result_w : rf[0];

                    // Keep the registered copy of the forwarded rs operand up to
                    // date for any logic outside this section that reads it.
                    rs_val_x <= rs_operand;

                    case (opcode_x)
                        2'b00: begin // ADDI: R[rd] <= R[rd] + sign_ext(imm, 4)
                            // rd is both source and destination, so the source
                            // operand is the forwarded R[rd].
                            alu_result_x <= rd_operand + $signed({{4{imm_x[3]}}, imm_x});
                        end
                        2'b01: begin // MOV: R[rd] <= R[rs]
                            // Use the same-cycle forwarded value, not rs_val_x,
                            // which only updates after this clock edge.
                            alu_result_x <= rs_operand;
                        end
                        2'b10: begin // BRcc
                            // cond: 00 = always, 01 = R0 == 0, 10 = R0 < 0, 11 = R0 > 0.
                            // r0_operand is signed, so the comparisons are signed.
                            case (cond_x)
                                2'b00: branch_taken_x <= 1;
                                2'b01: branch_taken_x <= (r0_operand == 0);
                                2'b10: branch_taken_x <= (r0_operand < 0);
                                2'b11: branch_taken_x <= (r0_operand > 0);
                                default: branch_taken_x <= 0;
                            endcase

                            // Branch target: PC + sign_ext(off, 4) + 1, where PC
                            // is pc_x, the address of the branch itself.
                            alu_result_x <= pc_x + $signed({{4{off_x[3]}}, off_x}) + 1;
                        end
                        2'b11: begin // HALT: no ALU result
                            alu_result_x <= 0;
                        end
                        default: begin
                            alu_result_x <= 0;
                        end
                    endcase
                end
            end

            // ----------------------------------------------------------------
            // Stage F: Fetch
            //
            // Updates the three F-stage registers: pc_f, instr_f and npc_f.
            // Every assignment is non-blocking, so each right-hand side below
            // reads the register values from before this clock edge.
            // ----------------------------------------------------------------
            if (pipeline_flush) begin
                // Flush: hold the fetch PC and insert a bubble.
                // NOTE(review): the design notes after this always block
                // conclude that pipeline_flush is always 0 - confirm whether
                // this branch is ever taken.
                pc_f <= pc_f; // explicit hold of the current fetch PC
                instr_f <= 0; // NOTE(review): assumes encoding 8'h00 is harmless as a bubble - confirm against the ISA
                npc_f <= pc_f + 1; // default sequential next PC
            end else begin
                // ---- Next fetch PC ----------------------------------------
                // Intended behaviour with one branch delay slot:
                //   Cycle T:   W = I-1, X = I (BRcc), F = I+1 (delay slot)
                //   Cycle T+1: W = I,   X = I+1,      F = branch target if the
                //              branch was taken, otherwise I+2
                // The delay-slot instruction I+1 is never squashed; a taken
                // branch only redirects the fetch that follows it.
                //
                // For a BRcc in X (opcode 2'b10) with branch_taken_x set, the
                // fetch PC is loaded from alu_result_x, which the X stage uses
                // to carry the branch target. alu_result_x is 8 bits wide, so
                // only its low 5 bits are kept by the 5-bit pc_f.
                // In every other case the fetch PC becomes pc_x + 1.
                //
                // NOTE(review): branch_taken_x and alu_result_x are registers
                // written by the X-stage logic in this same always block, so
                // the values read here are the ones from before this edge.
                // Confirm they still belong to the instruction that opcode_x
                // describes.
                // NOTE(review): the sequential path advances from pc_x, not
                // pc_f. Whether that selects the instruction after the one
                // currently in F depends on how pc_x is loaded, which is
                // outside this section - confirm.
                if (opcode_x == 2'b10) begin
                    if (branch_taken_x) begin
                        pc_f <= alu_result_x;
                    end else begin
                        pc_f <= pc_x + 1;
                    end
                end else begin
                    pc_f <= pc_x + 1;
                end
                
                // ---- Instruction fetch ------------------------------------
                // IMEM_INIT is declared [0:255] with byte 0 at the MSB end, so
                // the ascending part-select [pc_f*8 +: 8] picks instruction
                // byte number pc_f. The index is the pre-edge pc_f, not the
                // value assigned just above.
                // pc_f is 5 bits wide, so (pc_f < 32) holds for every known
                // value and the else branch is only reachable when pc_f is X
                // in simulation.
                if (pc_f < 32) begin
                    instr_f <= IMEM_INIT[pc_f * 8 +: 8];
                end else begin
                    instr_f <= 0; // out-of-range fallback
                end
                
                // Sequential successor of the PC whose instruction was fetched
                // above; wraps modulo 32 because npc_f is 5 bits wide.
                npc_f <= pc_f + 1;
            end
        end
    end

    // Pipeline Flush Logic
    // When does a flush happen?
    // In this simple pipeline with forwarding and no stalls, we don't have data hazards that require stalls.
    // We have branch hazards.
    // The branch decision is made in X stage.
    // The instruction in the F stage is the delay-slot instruction.
    // It executes whether or not the branch is taken.
    // The instruction *after* the delay slot (which will be in F stage in the next cycle) should be the target.
    // We handled this by updating pc_f directly based on branch_taken_x.
    // So we don't need to flush the F stage in the sense of bubbling it.
    // We just redirect the fetch.
    // What about the W stage?
    // The instruction in W stage is I-1.
    // The instruction in X stage is I (Branch).
    // The instruction in F stage is I+1 (Delay Slot).
    // If branch is taken, I+1 executes.
    // I (Branch) commits in W stage next cycle.
    // So no flush is needed for W or X.
    // The only "flush" is the redirection of F.
    // So pipeline_flush is always 0?
    // Wait, what if we have a HALT in X stage?
    // If X stage is HALT, it will set halt_internal in W stage next cycle.
    // The instruction in F stage (delay slot) will execute.
    // Then the pipeline halts.
    // This is correct.
    
    // What if we have a branch in X stage, and the branch target is the same as the next sequential PC?
    // Then it's effectively not taken.
    // Our logic handles this.
    
    // So pipeline_flush is not needed for data hazards.
    // Is it needed for control hazards?
    // We resolved control hazards by redirecting F.
    // So pipeline_flush <= 0;
    
    // However, if we wanted to implement a stall, we would set pipeline_flush.
    // But the problem says no stalls for ALU->ALU.
    // And we don't have Load-Use hazards.
    // So pipeline_flush is always 0.
    
    // Let's double check the forwarding logic.
    // ADDI: R[rd] <= R[rd] + imm
    // MOV: R[rd] <= R[rs]
    // BRcc: uses R0.
    
    // Forwarding for ADDI:
    // If the previous instruction (in W) wrote to R[rd], we use that value.
    // This is correct.
    
    // Forwarding for MOV:
    // If the previous instruction (in W) wrote to R[rs], we use that value.
    // This is correct.
    
    // Forwarding for BRcc:
    // If the previous instruction (in W) wrote to R0, we use that value.
    // This is correct.
    
    // What if the instruction in X stage is an ADDI that writes to R0, and