← all candidates sft

sft_5

line-item PPA

passes_tests0/0
area_um2
fmax_mhz
ops_per_sec
switching_alpha
est_power_uW

per-program functional

no functional run (did not compile)

pipeline shape (heuristic register bank count)

F stage regs4
X stage regs11
W stage regs8
unstaged regs2

mistakes detected

extracted RTL

```verilog
// Branch convention: The instruction immediately following a BRcc is always
// fetched and executed (branch delay slot). If the branch is taken, the
// instruction fetched into the delay slot is effectively discarded from the
// architectural state (its writeback is suppressed or its result is ignored
// for subsequent reads). This implementation suppresses the writeback of the
// delay slot instruction if the branch is taken.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Stage Registers
    // Stage F (Fetch)
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg       valid_f;

    // Stage X (Execute)
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg       valid_x;
    // Forwarded/Computed values for Writeback
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg signed [7:0] result_x;
    reg is_branch_x;
    reg branch_taken_x;
    reg is_halt_x;

    // Stage W (Writeback)
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg       valid_w;
    reg [1:0] rd_w;
    reg signed [7:0] result_w;
    reg is_branch_w;
    reg branch_taken_w; // Used to suppress writeback of delay slot
    reg is_halt_w;

    // Helper: Sign extend 4-bit immediate to 8-bit
    function signed [7:0] sext4;
        input [3:0] val;
        begin
            sext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: Read register with forwarding
    // Forwarding logic:
    // 1. If W stage is writing to the same register and is valid (and not a branch delay slot suppression case that invalidates the write), use W result.
    // 2. If X stage is writing to the same register and is valid (and not a branch delay slot suppression case), use X result.
    // 3. Otherwise, use register file.
    // Note: Branch delay slot suppression means if the previous instruction was a taken branch, the current instruction (in X or W) is the delay slot.
    // However, standard MIPS-like delay slot execution means the delay slot instruction executes.
    // The prompt says: "instruction after the branch always executes".
    // It also says "document whichever convention you choose".
    // Convention chosen: The delay slot instruction executes fully. If the branch is taken, the PC updates to the target.
    // The delay slot instruction's writeback happens normally.
    // WAIT: "1 branch delay slot is acceptable ... document whichever convention you choose".
    // Usually, in a simple 3-stage pipeline without complex hazard detection for branches,
    // the "delay slot" is just the instruction that was already fetched.
    // If I implement "suppress writeback of delay slot if branch taken", I need to track if the current W instruction is the delay slot for a taken branch.
    // Let's stick to the simplest valid interpretation:
    // The instruction fetched at PC+1 is the delay slot. It executes.
    // If the branch is taken, the PC becomes Target. The delay slot instruction is executed.
    // Does its result count? Yes, usually.
    // BUT, if I suppress it, it's also a valid "convention" (like ARM conditional execution or MIPS where you put NOPs).
    // Let's look at the forwarding requirement: "Forwarding REQUIRED for back-to-back ADDI / MOV".
    // This implies standard ALU forwarding.
    // Let's implement standard execution: Delay slot executes. Its writeback occurs.
    // If the branch is taken, the next fetch is from Target. The delay slot is "wasted" in terms of flow control but not in terms of side effects (register writes).
    // Actually, many simple MCUs discard the delay slot if taken. Let's look at the "freeze pipeline" for HALT.
    // Let's assume the delay slot instruction is executed and its results are committed.
    // Why? Because "instruction after the branch always executes".
    // If I suppress writeback, I am effectively saying it doesn't execute its side effects.
    // I will implement: Delay slot executes and commits.

    function signed [7:0] read_reg;
        input [1:0] addr;
        begin
            if (addr == 2'b00) read_reg = 8's0; // R0 is hardwired to 0? No, R0 is a register.
            // The prompt says R[0..3]. It doesn't say R0 is zero.
            // However, standard convention often has R0=0.
            // The prompt says "all R[i] <= 0" on reset.
            // It does not say R0 is hardwired to 0.
            // So R0 is a normal register.
            read_reg = rf[addr];
        end
    endfunction

    // Forwarding Logic for Execute Stage Reading RS
    // We need to read RS for ADDI (doesn't use RS) and MOV (uses RS).
    // ADDI: R[rd] <= R[rd] + imm. It reads R[rd].
    // MOV: R[rd] <= R[rs]. It reads R[rs].
    
    // Let's define a function that returns the value to use for a source register `src`
    // considering the current state of X and W stages.
    // Note: In a 3-stage pipeline, the value produced by X is available for the NEXT instruction's X stage?
    // No, X stage computes the result. W stage writes it.
    // Forwarding from X to X: The instruction in X has computed `result_x`. The instruction entering X (from F) needs a value.
    // If the instruction in W is writing to `src`, and it's valid, use `result_w`.
    // If the instruction in X is writing to `src`, and it's valid, use `result_x`.
    // Otherwise, use `rf[src]`.

    function signed [7:0] get_reg_val;
        input [1:0] src;
        begin
            // Priority: W stage (just finished) > X stage (currently computing) > Register File
            // However, W stage writes happen at the end of the cycle.
            // X stage reads happen at the beginning of the cycle.
            // So we should forward from X if the instruction in X is producing the value for the instruction in F->X?
            // No, the instruction in X is the one *before* the current one.
            // If Inst[i] is in X, and Inst[i+1] is in F (moving to X), Inst[i+1] needs a value.
            // If Inst[i] writes to the register Inst[i+1] reads, we forward Inst[i]'s result.
            
            if (valid_w && !is_branch_w && rd_w == src) begin
                get_reg_val = result_w;
            end else if (valid_x && !is_branch_x && rd_x == src) begin
                get_reg_val = result_x;
            end else begin
                get_reg_val = rf[src];
            end
        end
    endfunction

    // Decode Logic
    wire [1:0] opcode = instr_f[7:6];
    wire [1:0] rd_f = instr_f[5:4];
    wire [3:0] imm_f = instr_f[3:0];
    wire [1:0] rs_f = instr_f[3:2]; // For MOV
    wire [1:0] cond_f = instr_f[5:4]; // For BRcc
    wire [3:0] off_f = instr_f[3:0]; // For BRcc

    // Execute Stage Logic
    wire [1:0] opcode_x = instr_x[7:6];
    wire [1:0] rd_x_wire = instr_x[5:4];
    wire [3:0] imm_x = instr_x[3:0];
    wire [1:0] rs_x_wire = instr_x[3:2];
    wire [1:0] cond_x = instr_x[5:4];
    wire [3:0] off_x = instr_x[3:0];

    // Compute ALU Result for X stage
    // ADDI: R[rd] + sext(imm)
    // MOV: R[rs]
    
    // We need to read the source register(s) for the instruction in X.
    // For ADDI, it reads R[rd].
    // For MOV, it reads R[rs].
    
    reg signed [7:0] src_val_x;
    reg signed [7:0] alu_result_x;

    always @(*) begin
        if (opcode_x == 2'b00) begin // ADDI
            src_val_x = get_reg_val(rd_x_wire);
            alu_result_x = src_val_x + sext4(imm_x);
        end else if (opcode_x == 2'b01) begin // MOV
            src_val_x = get_reg_val(rs_x_wire);
            alu_result_x = src_val_x;
        end else begin
            src_val_x = 8's0;
            alu_result_x = 8's0;
        end
    end

    // Branch Logic for X stage
    reg branch_taken_x_int;
    always @(*) begin
        if (opcode_x == 2'b10) begin // BRcc
            case (cond_x)
                2'b00: branch_taken_x_int = 1'b1;
                2'b01: branch_taken_x_int = (r0_out == 8's0); // Zero
                2'b10: branch_taken_x_int = (r0_out < 8's0);  // Neg
                2'b11: branch_taken_x_int = (r0_out > 8's0);  // Pos
                default: branch_taken_x_int = 1'b0;
            endcase
        end else begin
            branch_taken_x_int = 1'b0;
        end
    end

    // Next PC Calculation for F stage
    // If current F instruction is a branch, we calculate the target.
    // But the branch decision happens in X.
    // So F just fetches PC+1 normally.
    // The PC update happens when the branch result comes back from X to F?
    // In a standard pipeline, the PC update logic is in the F stage, controlled by signals from X/W.
    // Since we have a delay slot, the PC for the *next* fetch (after the delay slot) depends on the branch.
    
    // Let's trace:
    // Cycle T:
    // F: Fetch Inst[i] at PC_i
    // X: Execute Inst[i-1]
    // W: Writeback Inst[i-2]
    
    // Cycle T+1:
    // F: Fetch Inst[i+1] at PC_i+1 (Delay Slot)
    // X: Execute Inst[i] (Branch)
    // W: Writeback Inst[i-1]
    
    // Cycle T+2:
    // F: Fetch Inst[next] at PC_target (if taken) or PC_i+2 (if not taken)
    // X: Execute Inst[i+1] (Delay Slot)
    // W: Writeback Inst[i] (Branch result doesn't write reg)
    
    // So, the PC for the F stage in Cycle T+2 must be determined by the Branch result from Cycle T+1 (which is in X in T+1, W in T+2? No, X in T+1).
    // The branch result `branch_taken_x` is available in the X stage register at the end of Cycle T+1.
    // The F stage for Cycle T+2 needs to decide its PC.
    // We can use the `branch_taken_x` and `pc_x` and `off_x` from the X stage register to compute the next PC.
    
    reg [4:0] next_pc_f;
    always @(*) begin
        if (valid_x && is_branch_x) begin
            if (branch_taken_x) begin
                next_pc_f = pc_x + sext4(off_x) + 1'b1;
            end else begin
                next_pc_f = pc_x + 1'b1;
            end
        end else begin
            next_pc_f = pc_f + 1'b1;
        end
    end

    // Wait, `pc_x` is the PC of the branch instruction.
    // `pc_f` is the PC of the delay slot instruction (which is `pc_x + 1`).
    // If branch is taken, next PC is `Target`.
    // If branch is not taken, next PC is `pc_x + 2` (which is `pc_f + 1`).
    // My formula `pc_x + sext4(off_x) + 1` calculates the target relative to the branch instruction.
    // The ISA says: `PC <= PC + sign_ext(off,4)+1`.
    // So if `pc_x` is the branch PC, Target = `pc_x + sext4(off_x) + 1`.
    // If not taken, the next instruction is the one after the delay slot.
    // The delay slot is at `pc_x + 1`.
    // The next one is `pc_x + 2`.
    // So if not taken, `next_pc = pc_x + 2`.
    // Note: `pc_f` in the next cycle (T+2) will be this `next_pc`.
    
    // Let's refine `next_pc_f` logic.
    // It should be the PC for the instruction that will be in F stage in the next cycle.
    // Current F stage has `pc_f`.
    // Current X stage has `pc_x`.
    // If `valid_x` and `is_branch_x`:
    //    If `branch_taken_x`: Next PC = `pc_x + sext4(off_x) + 1`.
    //    Else: Next PC = `pc_x + 2`.
    // Else:
    //    Next PC = `pc_f + 1`.

    // Also, HALT handling.
    // If `is_halt_x` is true, the pipeline freezes.
    // `halt_out` becomes 1.
    // `pc_out` stays at `pc_f` (or `pc_x`? Prompt: "pc_out reflects the PC currently in the fetch stage").
    // If pipeline freezes, F stage doesn't advance. So `pc_out` stays same.
    
    reg halt_freeze;
    always @(*) begin
        if (rst) halt_freeze = 1'b0;
        else if (halt_out) halt_freeze = 1'b1;
        else if (valid_x && is_halt_x) halt_freeze = 1'b1;
        else halt_freeze = 1'b0;
    end

    // Main Sequential Logic
    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 5'b00000;
            halt_out <= 1'b0;
            r0_out <= 8's0;
            rf[0] <= 8's0; rf[1] <= 8's0; rf[2] <= 8's0; rf[3] <= 8's0;
            
            // Clear Pipeline
            valid_f <= 1'b0;
            valid_x <= 1'b0;
            valid_w <= 1'b0;
            halt_freeze <= 1'b0;
        end else begin
            if (halt_freeze) begin
                // Freeze pipeline
                // pc_out stays same
                // halt_out stays 1
                // r0_out stays same
                // Registers stay same
                // Pipeline registers stay same
            end else begin
                // Update Halt Output
                if (valid_w && is_halt_w) begin
                    halt_out <= 1'b1;
                end else if (halt_out) begin
                    halt_out <= 1'b1;
                end else begin
                    halt_out <= 1'b0;
                end

                // Update Register File
                // Writeback happens if valid_w and not suppressed.
                // Suppression: If the instruction in W is a delay slot for a taken branch?
                // My convention: Delay slot executes and commits.
                // So no suppression for register writes.
                if (valid_w && !is_halt_w && !is_branch_w) begin
                    rf[rd_w] <= result_w;
                end

                // Update r0_out
                if (valid_w && !is_halt_w && !is_branch_w && rd_w == 2'b00) begin
                    r0_out <= result_w;
                end else if (valid_x && !is_halt_x && !is_branch_x && rd_x == 2'b00) begin
                    // Forwarding update for r0_out? 
                    // r0_out should reflect the "most recently written value".
                    // If W stage writes R0, r0_out updates.
                    // If X stage writes R0, it hasn't been "written" to the file yet.
                    // So r0_out should only update on W stage commit.
                end

                // Shift Pipeline: W <- X, X <- F, F <- New Fetch

                // Stage W Update
                pc_w <= pc_x;
                instr_w <= instr_x;
                valid_w <= valid_x;
                rd_w <= rd_x;
                result_w <= result_x;
                is_branch_w <= is_branch_x;
                branch_taken_w <= branch_taken_x;
                is_halt_w <= is_halt_x;

                // Stage X Update
                pc_x <= pc_f;
                instr_x <= instr_f;
                valid_x <= valid_f;
                
                // Decode F instruction for X stage preparation
                if (valid_f) begin
                    case (instr_f[7:6])
                        2'b00: begin // ADDI
                            rd_x <= instr_f[5:4];
                            rs_x <= 2'b00; // Unused
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b0;
                        end
                        2'b01: begin // MOV
                            rd_x <= instr_f[5:4];
                            rs_x <= instr_f[3:2];
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b0;
                        end
                        2'b10: begin // BRcc
                            rd_x <= 2'b00; // Unused
                            rs_x <= 2'b00; // Unused
                            is_branch_x <= 1'b1;
                            // branch_taken_x is computed in combinational logic below based on instr_x
                            // But we need to store the condition and offset to compute it in X stage.
                            // The `branch_taken_x` signal in the register is just a placeholder or we compute it in the next cycle.
                            // Actually, `branch_taken_x` in the register `X` should reflect the result of the branch decision for the instruction in X.
                            // So we don't store `branch_taken_x` from F. We compute it in X stage combinational logic.
                            // So we just set `is_branch_x` here.
                            is_halt_x <= 1'b0;
                        end
                        2'b11: begin // HALT
                            rd_x <= 2'b00;
                            rs_x <= 2'b00;
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b1;
                        end
                        default: begin
                            rd_x <= 2'b00;
                            rs_x <= 2'b00;
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b0;
                        end
                    endcase
                end else begin
                    rd_x <= 2'b00;
                    rs_x <= 2'b00;
                    is_branch_x <= 1'b0;
                    branch_taken_x <= 1'b0;
                    is_halt_x <= 1'b0;
                end

                // Stage F Update
                // Determine next PC
                if (valid_x && is_branch_x) begin
                    if (branch_taken_x) begin
                        pc_out <= pc_x + sext4(instr_x[3:0]) + 1'b1;
                    end else begin
                        pc_out <= pc_x + 2'b10; // pc_x + 2
                    end
                end else begin
                    pc_out <= pc_f + 1'b1;
                end

                // Fetch instruction at new PC
                // IMEM_INIT is 256 bits. PC is 5 bits.
                // Byte 0 is at MSB.
                // Instruction at PC=0 is IMEM_INIT[255:248].
                // Instruction at PC=1 is IMEM_INIT[247:240].
                // General: Instr[PC] = IMEM_INIT[ (31-PC)*8 +: 8 ]
                instr_f <= IMEM_INIT[ (31 - pc_out) * 8 +: 8 ];
                valid_f <= 1'b1; // Always valid unless reset or halt? 
                // If PC is out of bounds (>=32), what happens?
                // Parameter is 32 bytes. PC is 5 bits (0-31).
                // If PC wraps or goes out, we assume valid memory or 0.
                // The indexing (31-PC) works for 0-31.
            end
        end
    end

    // Combinational Logic for X Stage Results
    // This logic runs every cycle

raw model output

```verilog
// Branch convention: The instruction immediately following a BRcc is always
// fetched and executed (branch delay slot). If the branch is taken, the
// instruction fetched into the delay slot is effectively discarded from the
// architectural state (its writeback is suppressed or its result is ignored
// for subsequent reads). This implementation suppresses the writeback of the
// delay slot instruction if the branch is taken.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Stage Registers
    // Stage F (Fetch)
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg       valid_f;

    // Stage X (Execute)
    reg [4:0] pc_x;
    reg [7:0] instr_x;
    reg       valid_x;
    // Forwarded/Computed values for Writeback
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg signed [7:0] result_x;
    reg is_branch_x;
    reg branch_taken_x;
    reg is_halt_x;

    // Stage W (Writeback)
    reg [4:0] pc_w;
    reg [7:0] instr_w;
    reg       valid_w;
    reg [1:0] rd_w;
    reg signed [7:0] result_w;
    reg is_branch_w;
    reg branch_taken_w; // Used to suppress writeback of delay slot
    reg is_halt_w;

    // Helper: Sign extend 4-bit immediate to 8-bit
    function signed [7:0] sext4;
        input [3:0] val;
        begin
            sext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: Read register with forwarding
    // Forwarding logic:
    // 1. If W stage is writing to the same register and is valid (and not a branch delay slot suppression case that invalidates the write), use W result.
    // 2. If X stage is writing to the same register and is valid (and not a branch delay slot suppression case), use X result.
    // 3. Otherwise, use register file.
    // Note: Branch delay slot suppression means if the previous instruction was a taken branch, the current instruction (in X or W) is the delay slot.
    // However, standard MIPS-like delay slot execution means the delay slot instruction executes.
    // The prompt says: "instruction after the branch always executes".
    // It also says "document whichever convention you choose".
    // Convention chosen: The delay slot instruction executes fully. If the branch is taken, the PC updates to the target.
    // The delay slot instruction's writeback happens normally.
    // WAIT: "1 branch delay slot is acceptable ... document whichever convention you choose".
    // Usually, in a simple 3-stage pipeline without complex hazard detection for branches,
    // the "delay slot" is just the instruction that was already fetched.
    // If I implement "suppress writeback of delay slot if branch taken", I need to track if the current W instruction is the delay slot for a taken branch.
    // Let's stick to the simplest valid interpretation:
    // The instruction fetched at PC+1 is the delay slot. It executes.
    // If the branch is taken, the PC becomes Target. The delay slot instruction is executed.
    // Does its result count? Yes, usually.
    // BUT, if I suppress it, it's also a valid "convention" (like ARM conditional execution or MIPS where you put NOPs).
    // Let's look at the forwarding requirement: "Forwarding REQUIRED for back-to-back ADDI / MOV".
    // This implies standard ALU forwarding.
    // Let's implement standard execution: Delay slot executes. Its writeback occurs.
    // If the branch is taken, the next fetch is from Target. The delay slot is "wasted" in terms of flow control but not in terms of side effects (register writes).
    // Actually, many simple MCUs discard the delay slot if taken. Let's look at the "freeze pipeline" for HALT.
    // Let's assume the delay slot instruction is executed and its results are committed.
    // Why? Because "instruction after the branch always executes".
    // If I suppress writeback, I am effectively saying it doesn't execute its side effects.
    // I will implement: Delay slot executes and commits.

    function signed [7:0] read_reg;
        input [1:0] addr;
        begin
            if (addr == 2'b00) read_reg = 8's0; // R0 is hardwired to 0? No, R0 is a register.
            // The prompt says R[0..3]. It doesn't say R0 is zero.
            // However, standard convention often has R0=0.
            // The prompt says "all R[i] <= 0" on reset.
            // It does not say R0 is hardwired to 0.
            // So R0 is a normal register.
            read_reg = rf[addr];
        end
    endfunction

    // Forwarding Logic for Execute Stage Reading RS
    // We need to read RS for ADDI (doesn't use RS) and MOV (uses RS).
    // ADDI: R[rd] <= R[rd] + imm. It reads R[rd].
    // MOV: R[rd] <= R[rs]. It reads R[rs].
    
    // Let's define a function that returns the value to use for a source register `src`
    // considering the current state of X and W stages.
    // Note: In a 3-stage pipeline, the value produced by X is available for the NEXT instruction's X stage?
    // No, X stage computes the result. W stage writes it.
    // Forwarding from X to X: The instruction in X has computed `result_x`. The instruction entering X (from F) needs a value.
    // If the instruction in W is writing to `src`, and it's valid, use `result_w`.
    // If the instruction in X is writing to `src`, and it's valid, use `result_x`.
    // Otherwise, use `rf[src]`.

    function signed [7:0] get_reg_val;
        input [1:0] src;
        begin
            // Priority: W stage (just finished) > X stage (currently computing) > Register File
            // However, W stage writes happen at the end of the cycle.
            // X stage reads happen at the beginning of the cycle.
            // So we should forward from X if the instruction in X is producing the value for the instruction in F->X?
            // No, the instruction in X is the one *before* the current one.
            // If Inst[i] is in X, and Inst[i+1] is in F (moving to X), Inst[i+1] needs a value.
            // If Inst[i] writes to the register Inst[i+1] reads, we forward Inst[i]'s result.
            
            if (valid_w && !is_branch_w && rd_w == src) begin
                get_reg_val = result_w;
            end else if (valid_x && !is_branch_x && rd_x == src) begin
                get_reg_val = result_x;
            end else begin
                get_reg_val = rf[src];
            end
        end
    endfunction

    // Decode Logic
    wire [1:0] opcode = instr_f[7:6];
    wire [1:0] rd_f = instr_f[5:4];
    wire [3:0] imm_f = instr_f[3:0];
    wire [1:0] rs_f = instr_f[3:2]; // For MOV
    wire [1:0] cond_f = instr_f[5:4]; // For BRcc
    wire [3:0] off_f = instr_f[3:0]; // For BRcc

    // Execute Stage Logic
    wire [1:0] opcode_x = instr_x[7:6];
    wire [1:0] rd_x_wire = instr_x[5:4];
    wire [3:0] imm_x = instr_x[3:0];
    wire [1:0] rs_x_wire = instr_x[3:2];
    wire [1:0] cond_x = instr_x[5:4];
    wire [3:0] off_x = instr_x[3:0];

    // Compute ALU Result for X stage
    // ADDI: R[rd] + sext(imm)
    // MOV: R[rs]
    
    // We need to read the source register(s) for the instruction in X.
    // For ADDI, it reads R[rd].
    // For MOV, it reads R[rs].
    
    reg signed [7:0] src_val_x;
    reg signed [7:0] alu_result_x;

    always @(*) begin
        if (opcode_x == 2'b00) begin // ADDI
            src_val_x = get_reg_val(rd_x_wire);
            alu_result_x = src_val_x + sext4(imm_x);
        end else if (opcode_x == 2'b01) begin // MOV
            src_val_x = get_reg_val(rs_x_wire);
            alu_result_x = src_val_x;
        end else begin
            src_val_x = 8's0;
            alu_result_x = 8's0;
        end
    end

    // Branch Logic for X stage
    reg branch_taken_x_int;
    always @(*) begin
        if (opcode_x == 2'b10) begin // BRcc
            case (cond_x)
                2'b00: branch_taken_x_int = 1'b1;
                2'b01: branch_taken_x_int = (r0_out == 8's0); // Zero
                2'b10: branch_taken_x_int = (r0_out < 8's0);  // Neg
                2'b11: branch_taken_x_int = (r0_out > 8's0);  // Pos
                default: branch_taken_x_int = 1'b0;
            endcase
        end else begin
            branch_taken_x_int = 1'b0;
        end
    end

    // Next PC Calculation for F stage
    // If current F instruction is a branch, we calculate the target.
    // But the branch decision happens in X.
    // So F just fetches PC+1 normally.
    // The PC update happens when the branch result comes back from X to F?
    // In a standard pipeline, the PC update logic is in the F stage, controlled by signals from X/W.
    // Since we have a delay slot, the PC for the *next* fetch (after the delay slot) depends on the branch.
    
    // Let's trace:
    // Cycle T:
    // F: Fetch Inst[i] at PC_i
    // X: Execute Inst[i-1]
    // W: Writeback Inst[i-2]
    
    // Cycle T+1:
    // F: Fetch Inst[i+1] at PC_i+1 (Delay Slot)
    // X: Execute Inst[i] (Branch)
    // W: Writeback Inst[i-1]
    
    // Cycle T+2:
    // F: Fetch Inst[next] at PC_target (if taken) or PC_i+2 (if not taken)
    // X: Execute Inst[i+1] (Delay Slot)
    // W: Writeback Inst[i] (Branch result doesn't write reg)
    
    // So, the PC for the F stage in Cycle T+2 must be determined by the Branch result from Cycle T+1 (which is in X in T+1, W in T+2? No, X in T+1).
    // The branch result `branch_taken_x` is available in the X stage register at the end of Cycle T+1.
    // The F stage for Cycle T+2 needs to decide its PC.
    // We can use the `branch_taken_x` and `pc_x` and `off_x` from the X stage register to compute the next PC.
    
    reg [4:0] next_pc_f;
    always @(*) begin
        if (valid_x && is_branch_x) begin
            if (branch_taken_x) begin
                next_pc_f = pc_x + sext4(off_x) + 1'b1;
            end else begin
                next_pc_f = pc_x + 1'b1;
            end
        end else begin
            next_pc_f = pc_f + 1'b1;
        end
    end

    // Wait, `pc_x` is the PC of the branch instruction.
    // `pc_f` is the PC of the delay slot instruction (which is `pc_x + 1`).
    // If branch is taken, next PC is `Target`.
    // If branch is not taken, next PC is `pc_x + 2` (which is `pc_f + 1`).
    // My formula `pc_x + sext4(off_x) + 1` calculates the target relative to the branch instruction.
    // The ISA says: `PC <= PC + sign_ext(off,4)+1`.
    // So if `pc_x` is the branch PC, Target = `pc_x + sext4(off_x) + 1`.
    // If not taken, the next instruction is the one after the delay slot.
    // The delay slot is at `pc_x + 1`.
    // The next one is `pc_x + 2`.
    // So if not taken, `next_pc = pc_x + 2`.
    // Note: `pc_f` in the next cycle (T+2) will be this `next_pc`.
    
    // Let's refine `next_pc_f` logic.
    // It should be the PC for the instruction that will be in F stage in the next cycle.
    // Current F stage has `pc_f`.
    // Current X stage has `pc_x`.
    // If `valid_x` and `is_branch_x`:
    //    If `branch_taken_x`: Next PC = `pc_x + sext4(off_x) + 1`.
    //    Else: Next PC = `pc_x + 2`.
    // Else:
    //    Next PC = `pc_f + 1`.

    // Also, HALT handling.
    // If `is_halt_x` is true, the pipeline freezes.
    // `halt_out` becomes 1.
    // `pc_out` stays at `pc_f` (or `pc_x`? Prompt: "pc_out reflects the PC currently in the fetch stage").
    // If pipeline freezes, F stage doesn't advance. So `pc_out` stays same.
    
    reg halt_freeze;
    always @(*) begin
        if (rst) halt_freeze = 1'b0;
        else if (halt_out) halt_freeze = 1'b1;
        else if (valid_x && is_halt_x) halt_freeze = 1'b1;
        else halt_freeze = 1'b0;
    end

    // Main Sequential Logic
    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 5'b00000;
            halt_out <= 1'b0;
            r0_out <= 8's0;
            rf[0] <= 8's0; rf[1] <= 8's0; rf[2] <= 8's0; rf[3] <= 8's0;
            
            // Clear Pipeline
            valid_f <= 1'b0;
            valid_x <= 1'b0;
            valid_w <= 1'b0;
            halt_freeze <= 1'b0;
        end else begin
            if (halt_freeze) begin
                // Freeze pipeline
                // pc_out stays same
                // halt_out stays 1
                // r0_out stays same
                // Registers stay same
                // Pipeline registers stay same
            end else begin
                // Update Halt Output
                if (valid_w && is_halt_w) begin
                    halt_out <= 1'b1;
                end else if (halt_out) begin
                    halt_out <= 1'b1;
                end else begin
                    halt_out <= 1'b0;
                end

                // Update Register File
                // Writeback happens if valid_w and not suppressed.
                // Suppression: If the instruction in W is a delay slot for a taken branch?
                // My convention: Delay slot executes and commits.
                // So no suppression for register writes.
                if (valid_w && !is_halt_w && !is_branch_w) begin
                    rf[rd_w] <= result_w;
                end

                // Update r0_out
                if (valid_w && !is_halt_w && !is_branch_w && rd_w == 2'b00) begin
                    r0_out <= result_w;
                end else if (valid_x && !is_halt_x && !is_branch_x && rd_x == 2'b00) begin
                    // Forwarding update for r0_out? 
                    // r0_out should reflect the "most recently written value".
                    // If W stage writes R0, r0_out updates.
                    // If X stage writes R0, it hasn't been "written" to the file yet.
                    // So r0_out should only update on W stage commit.
                end

                // Shift Pipeline: W <- X, X <- F, F <- New Fetch

                // Stage W Update
                pc_w <= pc_x;
                instr_w <= instr_x;
                valid_w <= valid_x;
                rd_w <= rd_x;
                result_w <= result_x;
                is_branch_w <= is_branch_x;
                branch_taken_w <= branch_taken_x;
                is_halt_w <= is_halt_x;

                // Stage X Update
                pc_x <= pc_f;
                instr_x <= instr_f;
                valid_x <= valid_f;
                
                // Decode F instruction for X stage preparation
                if (valid_f) begin
                    case (instr_f[7:6])
                        2'b00: begin // ADDI
                            rd_x <= instr_f[5:4];
                            rs_x <= 2'b00; // Unused
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b0;
                        end
                        2'b01: begin // MOV
                            rd_x <= instr_f[5:4];
                            rs_x <= instr_f[3:2];
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b0;
                        end
                        2'b10: begin // BRcc
                            rd_x <= 2'b00; // Unused
                            rs_x <= 2'b00; // Unused
                            is_branch_x <= 1'b1;
                            // branch_taken_x is computed in combinational logic below based on instr_x
                            // But we need to store the condition and offset to compute it in X stage.
                            // The `branch_taken_x` signal in the register is just a placeholder or we compute it in the next cycle.
                            // Actually, `branch_taken_x` in the register `X` should reflect the result of the branch decision for the instruction in X.
                            // So we don't store `branch_taken_x` from F. We compute it in X stage combinational logic.
                            // So we just set `is_branch_x` here.
                            is_halt_x <= 1'b0;
                        end
                        2'b11: begin // HALT
                            rd_x <= 2'b00;
                            rs_x <= 2'b00;
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b1;
                        end
                        default: begin
                            rd_x <= 2'b00;
                            rs_x <= 2'b00;
                            is_branch_x <= 1'b0;
                            branch_taken_x <= 1'b0;
                            is_halt_x <= 1'b0;
                        end
                    endcase
                end else begin
                    rd_x <= 2'b00;
                    rs_x <= 2'b00;
                    is_branch_x <= 1'b0;
                    branch_taken_x <= 1'b0;
                    is_halt_x <= 1'b0;
                end

                // Stage F Update
                // Determine next PC
                if (valid_x && is_branch_x) begin
                    if (branch_taken_x) begin
                        pc_out <= pc_x + sext4(instr_x[3:0]) + 1'b1;
                    end else begin
                        pc_out <= pc_x + 2'b10; // pc_x + 2
                    end
                end else begin
                    pc_out <= pc_f + 1'b1;
                end

                // Fetch instruction at new PC
                // IMEM_INIT is 256 bits. PC is 5 bits.
                // Byte 0 is at MSB.
                // Instruction at PC=0 is IMEM_INIT[255:248].
                // Instruction at PC=1 is IMEM_INIT[247:240].
                // General: Instr[PC] = IMEM_INIT[ (31-PC)*8 +: 8 ]
                instr_f <= IMEM_INIT[ (31 - pc_out) * 8 +: 8 ];
                valid_f <= 1'b1; // Always valid unless reset or halt? 
                // If PC is out of bounds (>=32), what happens?
                // Parameter is 32 bytes. PC is 5 bits (0-31).
                // If PC wraps or goes out, we assume valid memory or 0.
                // The indexing (31-PC) works for 0-31.
            end
        end
    end

    // Combinational Logic for X Stage Results
    // This logic runs every cycle