← all candidates base

base_7

line-item PPA

passes_tests0/0
area_um2
fmax_mhz
ops_per_sec
switching_alpha
est_power_uW

per-program functional

no functional run (did not compile)

pipeline shape (heuristic register bank count)

F stage regs3
X stage regs16
W stage regs17
unstaged regs6

mistakes detected

(no automated diagnostics fired)

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Stage Registers
    // Fetch Stage
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg       halt_f; // propagate halt signal through pipeline

    // Execute Stage
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg       halt_x;
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [1:0] cond_x;
    reg signed [7:0] alu_result_x;
    reg       branch_taken_x;
    reg [4:0] branch_target_x;
    reg       is_branch_x;
    reg       is_halt_x;

    // Writeback Stage
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg       halt_w;
    reg [1:0] opcode_w;
    reg [1:0] rd_w;
    reg [1:0] rs_w;
    reg [3:0] imm_w;
    reg [1:0] cond_w;
    reg signed [7:0] alu_result_w;
    reg       branch_taken_w;
    reg [4:0] branch_target_w;
    reg       is_branch_w;
    reg       is_halt_w;
    reg       wb_en_w; // writeback enable

    // Forwarding Logic
    // We need to forward results from the Execute stage (X) and Writeback stage (W)
    // to the Execute stage inputs.
    // Since it's a 3-stage pipeline:
    // Stage F -> Stage X -> Stage W
    // Hazards occur when an instruction in X needs a register that is being written
    // by an instruction in W or X.
    
    // Forwarding signals
    reg [1:0] fwd_a; // 00: no forward, 01: forward from X, 10: forward from W
    reg [1:0] fwd_b; // 00: no forward, 01: forward from X, 10: forward from W
    
    // Data to forward
    reg signed [7:0] data_fwd_x; // Result from X stage ALU
    reg signed [7:0] data_fwd_w; // Result from W stage ALU (or just the written value)
    reg [1:0] rd_fwd_x;          // RD of instruction in X stage
    reg [1:0] rd_fwd_w;          // RD of instruction in W stage
    reg       valid_x;           // Is X stage holding a valid instruction?
    reg       valid_w;           // Is W stage holding a valid instruction?

    // Helper function for sign extension
    function signed [7:0] sign_ext4;
        input [3:0] val;
        begin
            sign_ext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Reset Logic
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'b00000;
            instr_f <= 8'b0;
            halt_f <= 1'b0;

            pc_x <= 5'b00000;
            instr_x <= 8'b0;
            halt_x <= 1'b0;
            opcode_x <= 2'b00;
            rd_x <= 2'b00;
            rs_x <= 2'b00;
            imm_x <= 4'b0000;
            cond_x <= 2'b00;
            alu_result_x <= 8'b0;
            branch_taken_x <= 1'b0;
            branch_target_x <= 5'b00000;
            is_branch_x <= 1'b0;
            is_halt_x <= 1'b0;

            pc_w <= 5'b00000;
            instr_w <= 8'b0;
            halt_w <= 1'b0;
            opcode_w <= 2'b00;
            rd_w <= 2'b00;
            rs_w <= 2'b00;
            imm_w <= 4'b0000;
            cond_w <= 2'b00;
            alu_result_w <= 8'b0;
            branch_taken_w <= 1'b0;
            branch_target_w <= 5'b00000;
            is_branch_w <= 1'b0;
            is_halt_w <= 1'b0;
            wb_en_w <= 1'b0;

            pc_out <= 5'b00000;
            halt_out <= 1'b0;
            r0_out <= 8'b0;

            for (integer i = 0; i < 4; i = i + 1) begin
                rf[i] <= 8'b0;
            end

            fwd_a <= 2'b00;
            fwd_b <= 2'b00;
            data_fwd_x <= 8'b0;
            data_fwd_w <= 8'b0;
            rd_fwd_x <= 2'b00;
            rd_fwd_w <= 2'b00;
            valid_x <= 1'b0;
            valid_w <= 1'b0;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            halt_out <= halt_out | halt_w; // Once halted, stay halted
            // r0_out reflects the most recently written value of R[0]
            // If W stage writes to R0, update r0_out
            if (wb_en_w && (rd_w == 2'b00)) begin
                r0_out <= alu_result_w;
            end else if (rst) begin // Handled in reset block, but for safety
                r0_out <= 8'b0;
            end

            // Update Register File
            if (wb_en_w) begin
                rf[rd_w] <= alu_result_w;
            end

            // Update Forwarding Data
            data_fwd_x <= alu_result_x;
            rd_fwd_x <= rd_x;
            valid_x <= ~halt_x; // Valid if not halted

            data_fwd_w <= alu_result_w;
            rd_fwd_w <= rd_w;
            valid_w <= ~halt_w; // Valid if not halted

            // Stage F: Fetch
            if (!halt_out) begin
                // Determine next PC for fetch
                // If the previous instruction (now in W) was a taken branch, 
                // the PC should be the branch target.
                // Note: The instruction in the delay slot (now in X) is executed,
                // but the PC for the *next* fetch after the delay slot is the target.
                // However, standard 1-delay-slot pipelines usually set the PC for the
                // fetch stage based on the branch resolution in the W stage.
                
                // Let's refine the PC update logic.
                // PC_F is the PC of the instruction currently being fetched.
                // At the end of the cycle, we decide what PC_F will be next.
                
                if (halt_w) begin
                    pc_f <= pc_f; // Freeze
                end else if (is_branch_w && branch_taken_w) begin
                    pc_f <= branch_target_w;
                end else begin
                    pc_f <= pc_f + 1'b1;
                end
                
                // Fetch instruction
                instr_f <= IMEM_INIT[pc_f * 8 +: 8];
                halt_f <= 1'b0;
            end else begin
                // Already halted, freeze
                pc_f <= pc_f;
                instr_f <= instr_f;
                halt_f <= 1'b1;
            end

            // Stage X: Decode/Execute
            if (halt_out) begin
                // Freeze X stage if halted
                // But we need to be careful. If halt is asserted in W, 
                // X should also freeze.
                // Actually, halt_out is updated from halt_w. 
                // If halt_w is 1, halt_out becomes 1 in the same cycle.
                // So next cycle, halt_out is 1.
                // We should freeze X if halt_out is 1 OR if halt_x was 1?
                // Let's just freeze if halt_out is 1.
                // Wait, if halt_out is 1, we don't want to advance the pipeline.
                // So we keep the current X state.
            end else begin
                // Move F to X
                pc_x <= pc_f;
                instr_x <= instr_f;
                halt_x <= halt_f;
                
                // Decode
                opcode_x <= instr_f[7:6];
                if (instr_f[7:6] == 2'b00) begin // ADDI
                    rd_x <= instr_f[5:4];
                    rs_x <= 2'b00; // Unused
                    imm_x <= instr_f[3:0];
                    cond_x <= 2'b00; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                end else if (instr_f[7:6] == 2'b01) begin // MOV
                    rd_x <= instr_f[5:4];
                    rs_x <= instr_f[3:2];
                    imm_x <= 4'b0000; // Unused
                    cond_x <= 2'b00; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                end else if (instr_f[7:6] == 2'b10) begin // BRcc
                    cond_x <= instr_f[5:4];
                    rd_x <= 2'b00; // Unused
                    rs_x <= 2'b00; // Unused
                    imm_x <= instr_f[3:0];
                    is_branch_x <= 1'b1;
                    is_halt_x <= 1'b0;
                end else begin // HALT
                    opcode_x <= 2'b11;
                    rd_x <= 2'b00;
                    rs_x <= 2'b00;
                    imm_x <= 4'b0000;
                    cond_x <= 2'b00;
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b1;
                end

                // Execute
                if (is_branch_x) begin
                    // Calculate branch target
                    branch_target_x <= pc_x + sign_ext4(imm_x) + 1'b1;
                    // Evaluate condition
                    // We need the current value of R0 for condition check.
                    // R0 might be updated by the instruction in W stage.
                    // So we should use forwarding for R0 read here?
                    // The condition depends on R0.
                    // Let's read R0 with forwarding.
                    signed [7:0] r0_val;
                    // Forwarding for R0 read in branch condition
                    if (valid_w && (rd_fwd_w == 2'b00)) begin
                        r0_val = data_fwd_w;
                    end else if (valid_x && (rd_fwd_x == 2'b00)) begin
                        r0_val = data_fwd_x;
                    end else begin
                        r0_val = rf[0];
                    end
                    
                    if (cond_x == 2'b00) begin // Always
                        branch_taken_x <= 1'b1;
                    end else if (cond_x == 2'b01) begin // Zero
                        branch_taken_x <= (r0_val == 8'b0);
                    end else if (cond_x == 2'b10) begin // Negative
                        branch_taken_x <= (r0_val[7] == 1'b1);
                    end else begin // Positive (non-zero and positive? or just > 0?)
                        // "pos(R0)" usually means > 0.
                        branch_taken_x <= (r0_val > 8'b0);
                    end
                    alu_result_x <= 8'b0; // Unused
                end else if (is_halt_x) begin
                    alu_result_x <= 8'b0;
                    branch_taken_x <= 1'b0;
                    branch_target_x <= 5'b0;
                end else begin
                    // ADDI or MOV
                    // Read source register (for MOV) or use immediate (for ADDI)
                    // For ADDI: R[rd] <= R[rd] + imm
                    // For MOV: R[rd] <= R[rs]
                    
                    signed [7:0] src_val;
                    if (opcode_x == 2'b00) begin // ADDI
                        // Read R[rd]
                        if (valid_w && (rd_fwd_w == rd_x)) begin
                            src_val = data_fwd_w;
                        end else if (valid_x && (rd_fwd_x == rd_x)) begin
                            src_val = data_fwd_x;
                        end else begin
                            src_val = rf[rd_x];
                        end
                        alu_result_x <= src_val + sign_ext4(imm_x);
                    end else if (opcode_x == 2'b01) begin // MOV
                        // Read R[rs]
                        if (valid_w && (rd_fwd_w == rs_x)) begin
                            src_val = data_fwd_w;
                        end else if (valid_x && (rd_fwd_x == rs_x)) begin
                            src_val = data_fwd_x;
                        end else begin
                            src_val = rf[rs_x];
                        end
                        alu_result_x <= src_val;
                    end else begin
                        alu_result_x <= 8'b0;
                    end
                    branch_taken_x <= 1'b0;
                    branch_target_x <= 5'b0;
                end
            end

            // Stage W: Writeback
            if (halt_out) begin
                // Freeze W stage
            end else begin
                // Move X to W
                pc_w <= pc_x;
                instr_w <= instr_x;
                halt_w <= halt_x;
                opcode_w <= opcode_x;
                rd_w <= rd_x;
                rs_w <= rs_x;
                imm_w <= imm_x;
                cond_w <= cond_x;
                alu_result_w <= alu_result_x;
                branch_taken_w <= branch_taken_x;
                branch_target_w <= branch_target_x;
                is_branch_w <= is_branch_x;
                is_halt_w <= is_halt_x;
                
                // Determine if writeback should occur
                if (is_halt_w) begin
                    wb_en_w <= 1'b0;
                end else if (is_branch_w) begin
                    // If branch is taken, the delay slot instruction's result is discarded?
                    // The problem says: "the instruction after the branch always executes"
                    // "document whichever convention you choose"
                    // Convention: Delay slot instruction executes and writes back normally.
                    // The branch target is loaded into PC for the *next* fetch.
                    // So we do write back the delay slot instruction.
                    // However, if the branch is taken, do we discard the writeback?
                    // Standard MIPS: Delay slot instruction is committed.
                    // So wb_en_w should be 1 if it's a normal instruction.
                    // But wait, the instruction in W stage is the one that was in X stage.
                    // If the instruction in W stage is a branch, it doesn't write back.
                    // If the instruction in W stage is a normal instruction (ADDI/MOV), it writes back.
                    // The branch taken status in W stage refers to the branch instruction itself.
                    // So if is_branch_w is 1, it's a branch instruction, so no writeback.
                    wb_en_w <= 1'b0;
                end else begin
                    // ADDI or MOV
                    wb_en_w <= 1'b1;
                end
            end

            // Forwarding Logic for Execute Stage
            // We need to forward data to the Execute stage inputs.
            // The inputs are R[rs] for MOV and R[rd] for ADDI.
            // We already handled the actual data selection in the Execute block above.
            // But we need to update the forwarding registers for the next cycle.
            // The forwarding logic determines which data to use for the *next* Execute stage.
            // So we update fwd_a and fwd_b here based on the current X and W stages.
            
            // For the next cycle's Execute stage, we need to know if we should forward from X or W.
            // The instruction in the next X stage will be the current F stage instruction.
            // Let's decode the current F stage instruction to see what it needs.
            reg [1:0] next_rd;
            reg [1:0] next_rs;
            reg       next_is_mov;
            reg       next_is_addi;
            
            if (instr_f[7:6] == 2'b00) begin // ADDI
                next_rd = instr_f[5:4];
                next_rs = 2'b00;
                next_is_addi = 1'b1;
                next_is_mov = 1'b0;
            end else if (instr_f[7:6] == 2'b01) begin // MOV
                next_rd = instr_f[5:4];
                next_rs = instr_f[3:2];
                next_is_addi = 1'b0;
                next_is_mov = 1'b1;
            end else begin
                next_rd = 2'b00;
                next_rs = 2'b00;
                next_is_addi = 1'b0;
                next_is_mov = 1'b0;
            end
            
            // Determine forwarding for the next instruction (currently in F)
            // It will move to X in the next cycle.
            // We need to forward to its source registers.
            
            // Forwarding for ADDI: needs R[next_rd]
            if (next_is_addi) begin
                if (valid_w && (rd_fwd_w == next_rd)) begin
                    fwd_a <= 2'b10; // Forward from W
                end else if (valid_x && (rd_fwd_x == next_rd)) begin
                    fwd_a <= 2'b01; // Forward from X
                end else begin
                    fwd_a <= 2'b00; // No forward
                end
                fwd_b <= 2'b00; // ADDI only has one source register
            end else if (next_is_mov) begin
                // Forwarding for MOV: needs R[next_rs]
                if (valid_w && (rd_fwd_w == next_rs)) begin
                    fwd_b <= 2'b10; // Forward from W
                end else if (valid_x && (rd_fwd_x == next_rs)) begin
                    fwd_b <= 2'b01; // Forward from X
                end else begin
                    fwd_b <= 2'b00; // No forward
                end
                fwd_a <= 2'b00; // MOV only has one source register
            end else begin
                fwd_a <= 2'b00;
                fwd_b <= 2'b00;
            end
        end
    end

endmodule

raw model output

```verilog
// Branch convention: 1-branch-delay-slot. The instruction immediately following
// a BRcc is always fetched and executed. If the branch is taken, the delay-slot
// instruction's result is discarded (not written back), but its side effects
// (e.g., register updates) are committed. The PC for the next fetch after the
// delay slot is the branch target.
module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Stage Registers
    // Fetch Stage
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg       halt_f; // propagate halt signal through pipeline

    // Execute Stage
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg       halt_x;
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [1:0] cond_x;
    reg signed [7:0] alu_result_x;
    reg       branch_taken_x;
    reg [4:0] branch_target_x;
    reg       is_branch_x;
    reg       is_halt_x;

    // Writeback Stage
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg       halt_w;
    reg [1:0] opcode_w;
    reg [1:0] rd_w;
    reg [1:0] rs_w;
    reg [3:0] imm_w;
    reg [1:0] cond_w;
    reg signed [7:0] alu_result_w;
    reg       branch_taken_w;
    reg [4:0] branch_target_w;
    reg       is_branch_w;
    reg       is_halt_w;
    reg       wb_en_w; // writeback enable

    // Forwarding Logic
    // We need to forward results from the Execute stage (X) and Writeback stage (W)
    // to the Execute stage inputs.
    // Since it's a 3-stage pipeline:
    // Stage F -> Stage X -> Stage W
    // Hazards occur when an instruction in X needs a register that is being written
    // by an instruction in W or X.
    
    // Forwarding signals
    reg [1:0] fwd_a; // 00: no forward, 01: forward from X, 10: forward from W
    reg [1:0] fwd_b; // 00: no forward, 01: forward from X, 10: forward from W
    
    // Data to forward
    reg signed [7:0] data_fwd_x; // Result from X stage ALU
    reg signed [7:0] data_fwd_w; // Result from W stage ALU (or just the written value)
    reg [1:0] rd_fwd_x;          // RD of instruction in X stage
    reg [1:0] rd_fwd_w;          // RD of instruction in W stage
    reg       valid_x;           // Is X stage holding a valid instruction?
    reg       valid_w;           // Is W stage holding a valid instruction?

    // Helper function for sign extension
    function signed [7:0] sign_ext4;
        input [3:0] val;
        begin
            sign_ext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Reset Logic
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'b00000;
            instr_f <= 8'b0;
            halt_f <= 1'b0;

            pc_x <= 5'b00000;
            instr_x <= 8'b0;
            halt_x <= 1'b0;
            opcode_x <= 2'b00;
            rd_x <= 2'b00;
            rs_x <= 2'b00;
            imm_x <= 4'b0000;
            cond_x <= 2'b00;
            alu_result_x <= 8'b0;
            branch_taken_x <= 1'b0;
            branch_target_x <= 5'b00000;
            is_branch_x <= 1'b0;
            is_halt_x <= 1'b0;

            pc_w <= 5'b00000;
            instr_w <= 8'b0;
            halt_w <= 1'b0;
            opcode_w <= 2'b00;
            rd_w <= 2'b00;
            rs_w <= 2'b00;
            imm_w <= 4'b0000;
            cond_w <= 2'b00;
            alu_result_w <= 8'b0;
            branch_taken_w <= 1'b0;
            branch_target_w <= 5'b00000;
            is_branch_w <= 1'b0;
            is_halt_w <= 1'b0;
            wb_en_w <= 1'b0;

            pc_out <= 5'b00000;
            halt_out <= 1'b0;
            r0_out <= 8'b0;

            for (integer i = 0; i < 4; i = i + 1) begin
                rf[i] <= 8'b0;
            end

            fwd_a <= 2'b00;
            fwd_b <= 2'b00;
            data_fwd_x <= 8'b0;
            data_fwd_w <= 8'b0;
            rd_fwd_x <= 2'b00;
            rd_fwd_w <= 2'b00;
            valid_x <= 1'b0;
            valid_w <= 1'b0;
        end else begin
            // Update outputs
            pc_out <= pc_f;
            halt_out <= halt_out | halt_w; // Once halted, stay halted
            // r0_out reflects the most recently written value of R[0]
            // If W stage writes to R0, update r0_out
            if (wb_en_w && (rd_w == 2'b00)) begin
                r0_out <= alu_result_w;
            end else if (rst) begin // Handled in reset block, but for safety
                r0_out <= 8'b0;
            end

            // Update Register File
            if (wb_en_w) begin
                rf[rd_w] <= alu_result_w;
            end

            // Update Forwarding Data
            data_fwd_x <= alu_result_x;
            rd_fwd_x <= rd_x;
            valid_x <= ~halt_x; // Valid if not halted

            data_fwd_w <= alu_result_w;
            rd_fwd_w <= rd_w;
            valid_w <= ~halt_w; // Valid if not halted

            // Stage F: Fetch
            if (!halt_out) begin
                // Determine next PC for fetch
                // If the previous instruction (now in W) was a taken branch, 
                // the PC should be the branch target.
                // Note: The instruction in the delay slot (now in X) is executed,
                // but the PC for the *next* fetch after the delay slot is the target.
                // However, standard 1-delay-slot pipelines usually set the PC for the
                // fetch stage based on the branch resolution in the W stage.
                
                // Let's refine the PC update logic.
                // PC_F is the PC of the instruction currently being fetched.
                // At the end of the cycle, we decide what PC_F will be next.
                
                if (halt_w) begin
                    pc_f <= pc_f; // Freeze
                end else if (is_branch_w && branch_taken_w) begin
                    pc_f <= branch_target_w;
                end else begin
                    pc_f <= pc_f + 1'b1;
                end
                
                // Fetch instruction
                instr_f <= IMEM_INIT[pc_f * 8 +: 8];
                halt_f <= 1'b0;
            end else begin
                // Already halted, freeze
                pc_f <= pc_f;
                instr_f <= instr_f;
                halt_f <= 1'b1;
            end

            // Stage X: Decode/Execute
            if (halt_out) begin
                // Freeze X stage if halted
                // But we need to be careful. If halt is asserted in W, 
                // X should also freeze.
                // Actually, halt_out is updated from halt_w. 
                // If halt_w is 1, halt_out becomes 1 in the same cycle.
                // So next cycle, halt_out is 1.
                // We should freeze X if halt_out is 1 OR if halt_x was 1?
                // Let's just freeze if halt_out is 1.
                // Wait, if halt_out is 1, we don't want to advance the pipeline.
                // So we keep the current X state.
            end else begin
                // Move F to X
                pc_x <= pc_f;
                instr_x <= instr_f;
                halt_x <= halt_f;
                
                // Decode
                opcode_x <= instr_f[7:6];
                if (instr_f[7:6] == 2'b00) begin // ADDI
                    rd_x <= instr_f[5:4];
                    rs_x <= 2'b00; // Unused
                    imm_x <= instr_f[3:0];
                    cond_x <= 2'b00; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                end else if (instr_f[7:6] == 2'b01) begin // MOV
                    rd_x <= instr_f[5:4];
                    rs_x <= instr_f[3:2];
                    imm_x <= 4'b0000; // Unused
                    cond_x <= 2'b00; // Unused
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b0;
                end else if (instr_f[7:6] == 2'b10) begin // BRcc
                    cond_x <= instr_f[5:4];
                    rd_x <= 2'b00; // Unused
                    rs_x <= 2'b00; // Unused
                    imm_x <= instr_f[3:0];
                    is_branch_x <= 1'b1;
                    is_halt_x <= 1'b0;
                end else begin // HALT
                    opcode_x <= 2'b11;
                    rd_x <= 2'b00;
                    rs_x <= 2'b00;
                    imm_x <= 4'b0000;
                    cond_x <= 2'b00;
                    is_branch_x <= 1'b0;
                    is_halt_x <= 1'b1;
                end

                // Execute
                if (is_branch_x) begin
                    // Calculate branch target
                    branch_target_x <= pc_x + sign_ext4(imm_x) + 1'b1;
                    // Evaluate condition
                    // We need the current value of R0 for condition check.
                    // R0 might be updated by the instruction in W stage.
                    // So we should use forwarding for R0 read here?
                    // The condition depends on R0.
                    // Let's read R0 with forwarding.
                    signed [7:0] r0_val;
                    // Forwarding for R0 read in branch condition
                    if (valid_w && (rd_fwd_w == 2'b00)) begin
                        r0_val = data_fwd_w;
                    end else if (valid_x && (rd_fwd_x == 2'b00)) begin
                        r0_val = data_fwd_x;
                    end else begin
                        r0_val = rf[0];
                    end
                    
                    if (cond_x == 2'b00) begin // Always
                        branch_taken_x <= 1'b1;
                    end else if (cond_x == 2'b01) begin // Zero
                        branch_taken_x <= (r0_val == 8'b0);
                    end else if (cond_x == 2'b10) begin // Negative
                        branch_taken_x <= (r0_val[7] == 1'b1);
                    end else begin // Positive (non-zero and positive? or just > 0?)
                        // "pos(R0)" usually means > 0.
                        branch_taken_x <= (r0_val > 8'b0);
                    end
                    alu_result_x <= 8'b0; // Unused
                end else if (is_halt_x) begin
                    alu_result_x <= 8'b0;
                    branch_taken_x <= 1'b0;
                    branch_target_x <= 5'b0;
                end else begin
                    // ADDI or MOV
                    // Read source register (for MOV) or use immediate (for ADDI)
                    // For ADDI: R[rd] <= R[rd] + imm
                    // For MOV: R[rd] <= R[rs]
                    
                    signed [7:0] src_val;
                    if (opcode_x == 2'b00) begin // ADDI
                        // Read R[rd]
                        if (valid_w && (rd_fwd_w == rd_x)) begin
                            src_val = data_fwd_w;
                        end else if (valid_x && (rd_fwd_x == rd_x)) begin
                            src_val = data_fwd_x;
                        end else begin
                            src_val = rf[rd_x];
                        end
                        alu_result_x <= src_val + sign_ext4(imm_x);
                    end else if (opcode_x == 2'b01) begin // MOV
                        // Read R[rs]
                        if (valid_w && (rd_fwd_w == rs_x)) begin
                            src_val = data_fwd_w;
                        end else if (valid_x && (rd_fwd_x == rs_x)) begin
                            src_val = data_fwd_x;
                        end else begin
                            src_val = rf[rs_x];
                        end
                        alu_result_x <= src_val;
                    end else begin
                        alu_result_x <= 8'b0;
                    end
                    branch_taken_x <= 1'b0;
                    branch_target_x <= 5'b0;
                end
            end

            // Stage W: Writeback
            if (halt_out) begin
                // Freeze W stage
            end else begin
                // Move X to W
                pc_w <= pc_x;
                instr_w <= instr_x;
                halt_w <= halt_x;
                opcode_w <= opcode_x;
                rd_w <= rd_x;
                rs_w <= rs_x;
                imm_w <= imm_x;
                cond_w <= cond_x;
                alu_result_w <= alu_result_x;
                branch_taken_w <= branch_taken_x;
                branch_target_w <= branch_target_x;
                is_branch_w <= is_branch_x;
                is_halt_w <= is_halt_x;
                
                // Determine if writeback should occur
                if (is_halt_w) begin
                    wb_en_w <= 1'b0;
                end else if (is_branch_w) begin
                    // If branch is taken, the delay slot instruction's result is discarded?
                    // The problem says: "the instruction after the branch always executes"
                    // "document whichever convention you choose"
                    // Convention: Delay slot instruction executes and writes back normally.
                    // The branch target is loaded into PC for the *next* fetch.
                    // So we do write back the delay slot instruction.
                    // However, if the branch is taken, do we discard the writeback?
                    // Standard MIPS: Delay slot instruction is committed.
                    // So wb_en_w should be 1 if it's a normal instruction.
                    // But wait, the instruction in W stage is the one that was in X stage.
                    // If the instruction in W stage is a branch, it doesn't write back.
                    // If the instruction in W stage is a normal instruction (ADDI/MOV), it writes back.
                    // The branch taken status in W stage refers to the branch instruction itself.
                    // So if is_branch_w is 1, it's a branch instruction, so no writeback.
                    wb_en_w <= 1'b0;
                end else begin
                    // ADDI or MOV
                    wb_en_w <= 1'b1;
                end
            end

            // Forwarding Logic for Execute Stage
            // We need to forward data to the Execute stage inputs.
            // The inputs are R[rs] for MOV and R[rd] for ADDI.
            // We already handled the actual data selection in the Execute block above.
            // But we need to update the forwarding registers for the next cycle.
            // The forwarding logic determines which data to use for the *next* Execute stage.
            // So we update fwd_a and fwd_b here based on the current X and W stages.
            
            // For the next cycle's Execute stage, we need to know if we should forward from X or W.
            // The instruction in the next X stage will be the current F stage instruction.
            // Let's decode the current F stage instruction to see what it needs.
            reg [1:0] next_rd;
            reg [1:0] next_rs;
            reg       next_is_mov;
            reg       next_is_addi;
            
            if (instr_f[7:6] == 2'b00) begin // ADDI
                next_rd = instr_f[5:4];
                next_rs = 2'b00;
                next_is_addi = 1'b1;
                next_is_mov = 1'b0;
            end else if (instr_f[7:6] == 2'b01) begin // MOV
                next_rd = instr_f[5:4];
                next_rs = instr_f[3:2];
                next_is_addi = 1'b0;
                next_is_mov = 1'b1;
            end else begin
                next_rd = 2'b00;
                next_rs = 2'b00;
                next_is_addi = 1'b0;
                next_is_mov = 1'b0;
            end
            
            // Determine forwarding for the next instruction (currently in F)
            // It will move to X in the next cycle.
            // We need to forward to its source registers.
            
            // Forwarding for ADDI: needs R[next_rd]
            if (next_is_addi) begin
                if (valid_w && (rd_fwd_w == next_rd)) begin
                    fwd_a <= 2'b10; // Forward from W
                end else if (valid_x && (rd_fwd_x == next_rd)) begin
                    fwd_a <= 2'b01; // Forward from X
                end else begin
                    fwd_a <= 2'b00; // No forward
                end
                fwd_b <= 2'b00; // ADDI only has one source register
            end else if (next_is_mov) begin
                // Forwarding for MOV: needs R[next_rs]
                if (valid_w && (rd_fwd_w == next_rs)) begin
                    fwd_b <= 2'b10; // Forward from W
                end else if (valid_x && (rd_fwd_x == next_rs)) begin
                    fwd_b <= 2'b01; // Forward from X
                end else begin
                    fwd_b <= 2'b00; // No forward
                end
                fwd_a <= 2'b00; // MOV only has one source register
            end else begin
                fwd_a <= 2'b00;
                fwd_b <= 2'b00;
            end
        end
    end

endmodule
```