sft_2

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // Pipeline registers
    // Stage F -> X
    reg [7:0]  instr_x;
    reg [4:0]  pc_x;
    reg        valid_x;

    // Stage X -> W
    reg [7:0]  instr_w;
    reg [4:0]  pc_w;
    reg        valid_w;
    reg        is_halt_w;
    reg        is_reg_write_w;
    reg [1:0]  rd_w;
    reg signed [7:0] result_w;

    // Forwarding registers (holding X stage results for W->X forwarding)
    reg        is_reg_write_x;
    reg [1:0]  rd_x;
    reg signed [7:0] result_x;

    // Next PC logic for F stage
    reg [4:0]  npc_f;
    reg        take_branch_f;

    // Decode/Execute logic
    reg        is_halt_x;
    reg        is_branch_x;
    reg        is_reg_write_x_int;
    reg [1:0]  rd_x_int;
    reg [1:0]  rs_x_int;
    reg signed [7:0] result_x_int;
    reg [3:0]  imm_x;
    reg [1:0]  cond_x;
    reg [3:0]  off_x;
    reg        branch_taken_x;

    // Forwarding mux selects for X stage
    reg signed [7:0] rs_fwd;

    // Helper: sign extend 4-bit to 8-bit
    function signed [7:0] sext4;
        input [3:0] val;
        begin
            sext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: sign extend 4-bit offset to 5-bit for PC addition
    function signed [4:0] sext4_off;
        input [3:0] val;
        begin
            sext4_off = {{4{val[3]}}, val};
        end
    endfunction

    // IMEM access: byte-addressed, 32 bytes. IMEM_INIT is 256-bit.
    // Byte 0 is at bits [255:248], Byte 1 at [247:240], etc.
    // PC is 5-bit, 0..31.
    // Instruction at PC i is IMEM_INIT[(31-i)*8 +: 8] ? 
    // Let's check: "byte 0 at MSB". So byte 0 is MSB of the 256-bit value.
    // In Verilog, if we index IMEM_INIT, bit 255 is the MSB.
    // Byte 0: bits [255:248]
    // Byte 1: bits [247:240]
    // Byte k: bits [255-k*8 : 248-k*8]
    // So for PC=p, instruction = IMEM_INIT[255-p*8 -: 8]
    wire [7:0] imem_data = IMEM_INIT[255 - pc_out*8 -: 8];

    // F Stage: Fetch instruction and compute next PC
    // The next PC depends on the branch decision from the W stage (for the instruction at pc_w)
    // But wait, the branch decision is made in X stage, and the result is in W stage.
    // Actually, for the F stage, we need to know if the instruction in W stage is a taken branch.
    // If so, the F stage should fetch from the branch target instead of PC+1.
    // However, the problem says "1 branch delay slot is acceptable".
    // Standard 3-stage pipeline with delay slot:
    // F: Fetch instr at PC
    // X: Decode instr at PC-1
    // W: Writeback instr at PC-2
    // Branch resolution happens in X stage. The instruction in the delay slot (fetched by F when branch was in X) is executed.
    // If branch taken, the next instruction after the delay slot is the target.
    // So, when the branch instruction is in X stage, the F stage fetches the delay slot instruction (PC+1).
    // When the branch instruction moves to W stage, if taken, the F stage should fetch the target.
    // But the F stage is fetching for the next cycle. The PC for the next cycle's F stage is determined by the W stage's branch result.
    
    // Let's define the pipeline state more carefully.
    // Cycle N:
    // F: Fetch instr at PC_F
    // X: Execute instr at PC_X (which was fetched in cycle N-1)
    // W: Writeback instr at PC_W (which was executed in cycle N-2)
    
    // Branch instruction BRcc is at PC_B.
    // Cycle T: BRcc in X stage. F stage fetches PC_B+1 (delay slot).
    // Cycle T+1: BRcc in W stage. Delay slot instr in X stage. F stage fetches next instr.
    // If BRcc taken, next instr should be target. If not taken, next instr is PC_B+2.
    // So, the F stage in cycle T+2 should fetch from target or PC_B+2 based on W stage result of BRcc.
    
    // Implementation:
    // In W stage, if is_halt_w, halt.
    // If is_branch_w and branch_taken_w, then next PC for F stage is branch_target.
    // Else, next PC is PC_W + 2 (because PC_W+1 was delay slot, PC_W+2 is next).
    // Wait, PC_W is the PC of the instruction in W stage.
    // The instruction in X stage has PC_X = PC_W + 1.
    // The instruction in F stage has PC_F = PC_X + 1 = PC_W + 2.
    // If the instruction in W stage is a taken branch, the instruction that should be fetched next (after the delay slot) is the target.
    // But the delay slot instruction is already in X stage. It will execute.
    // The instruction fetched in F stage (PC_W+2) will go to X stage next cycle.
    // If branch taken, we want the instruction after the delay slot to be the target.
    // So, if branch taken, the F stage should fetch from target.
    // But the F stage is currently fetching PC_W+2. We need to override it.
    
    // Let's use a signal from W stage to override the PC increment.
    reg        branch_taken_w;
    reg [4:0]  branch_target_w;

    // F Stage Logic
    always @(*) begin
        if (halt_out) begin
            npc_f = pc_out; // Freeze PC
            take_branch_f = 0;
        end else if (valid_w && is_branch_w && branch_taken_w) begin
            npc_f = branch_target_w;
            take_branch_f = 1;
        end else begin
            npc_f = pc_out + 1;
            take_branch_f = 0;
        end
    end

    // X Stage Logic
    always @(*) begin
        // Default
        is_halt_x = 0;
        is_branch_x = 0;
        is_reg_write_x_int = 0;
        rd_x_int = 2'b00;
        rs_x_int = 2'b00;
        result_x_int = 8'sb0;
        imm_x = 4'b0000;
        cond_x = 2'b00;
        off_x = 4'b0000;
        branch_taken_x = 0;
        rs_fwd = 8'sb0;

        if (!valid_x) begin
            // No instruction
        end else begin
            case (instr_x[7:6])
                2'b00: begin // ADDI
                    is_reg_write_x_int = 1;
                    rd_x_int = instr_x[5:4];
                    imm_x = instr_x[3:0];
                    // R[rd] = R[rd] + sign_ext(imm)
                    // Forwarding for R[rd] (which is the same as rs for ADDI? No, ADDI only writes rd, reads nothing from R)
                    // ADDI reads nothing from R, so no forwarding needed for rs.
                    result_x_int = R[rd_x_int] + sext4(imm_x);
                end
                2'b01: begin // MOV
                    is_reg_write_x_int = 1;
                    rd_x_int = instr_x[5:4];
                    rs_x_int = instr_x[3:2];
                    // R[rd] = R[rs]
                    // Forwarding for R[rs]
                    rs_fwd = R[rs_x_int]; // Default
                    // Check forwarding from W stage
                    if (valid_w && is_reg_write_w && (rd_w == rs_x_int) && !is_halt_w) begin
                        rs_fwd = result_w;
                    end
                    // Check forwarding from X stage (self-forwarding? No, rs and rd are different usually, but if same, use current R)
                    // If rd_x == rs_x, we should use the current value from R, not the result of the same instruction.
                    // But in a pipeline, the X stage result is not yet committed. So we use R[rs].
                    // However, if the previous instruction (in W) wrote to rs, we forward from W.
                    // If the previous instruction (in X) wrote to rs, we forward from X.
                    if (valid_w && is_reg_write_w && (rd_w == rs_x_int) && !is_halt_w) begin
                        rs_fwd = result_w;
                    end else if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                        // Self-forward: use current R value
                        rs_fwd = R[rs_x_int];
                    end else if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                         // This case is covered above.
                    end else begin
                        // Check X stage forwarding for rs
                        if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                            rs_fwd = R[rs_x_int]; // Use current R if rd==rs
                        end else begin
                            rs_fwd = R[rs_x_int];
                        end
                    end
                    // Re-evaluate forwarding logic more cleanly:
                    // Priority: W stage > X stage > Register File
                    if (valid_w && is_reg_write_w && (rd_w == rs_x_int) && !is_halt_w) begin
                        rs_fwd = result_w;
                    end else if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                        // If the current instruction writes to rd and reads from rs, and rd==rs,
                        // we should use the current value from R, not the result of the current instruction.
                        rs_fwd = R[rs_x_int];
                    end else begin
                        rs_fwd = R[rs_x_int];
                    end
                    result_x_int = rs_fwd;
                end
                2'b10: begin // BRcc
                    is_branch_x = 1;
                    cond_x = instr_x[5:4];
                    off_x = instr_x[3:0];
                    // Branch condition
                    case (cond_x)
                        2'b00: branch_taken_x = 1; // Always
                        2'b01: branch_taken_x = (R[0] == 8'sb0); // Zero
                        2'b10: branch_taken_x = (R[0] < 8'sb0); // Negative
                        2'b11: branch_taken_x = (R[0] > 8'sb0); // Positive
                        default: branch_taken_x = 0;
                    endcase
                    // Compute branch target: PC + sign_ext(off) + 1
                    // PC for this instruction is pc_x
                    result_x_int = pc_x + sext4_off(off_x) + 1;
                end
                2'b11: begin // HALT
                    is_halt_x = 1;
                end
                default: begin
                    // NOP or invalid
                end
            endcase
        end
    end

    // W Stage Logic
    always @(*) begin
        is_halt_w = 0;
        is_branch_w = 0;
        is_reg_write_w = 0;
        rd_w = 2'b00;
        result_w = 8'sb0;
        branch_taken_w = 0;
        branch_target_w = 5'b00000;

        if (!valid_w) begin
            // No instruction
        end else begin
            if (is_halt_x) begin
                is_halt_w = 1;
            end else if (is_branch_x) begin
                is_branch_w = 1;
                branch_taken_w = branch_taken_x;
                branch_target_w = result_x_int; // This is the target PC
            end else if (is_reg_write_x_int) begin
                is_reg_write_w = 1;
                rd_w = rd_x_int;
                result_w = result_x_int;
            end
        end
    end

    // Pipeline Register Updates
    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 5'b00000;
            halt_out <= 1'b0;
            r0_out <= 8'sb0;
            for (int i = 0; i < 4; i++) R[i] <= 8'sb0;
            valid_x <= 1'b0;
            valid_w <= 1'b0;
            instr_x <= 8'b0;
            pc_x <= 5'b0;
            instr_w <= 8'b0;
            pc_w <= 5'b0;
            is_halt_w <= 1'b0;
            is_reg_write_w <= 1'b0;
            rd_w <= 2'b0;
            result_w <= 8'sb0;
            is_reg_write_x <= 1'b0;
            rd_x <= 2'b0;
            result_x <= 8'sb0;
        end else if (halt_out) begin
            // Freeze everything
            pc_out <= pc_out;
            halt_out <= 1'b1;
            r0_out <= r0_out;
            // R remains unchanged
            valid_x <= 1'b0;
            valid_w <= 1'b0;
            // Keep other regs stable
        end else begin
            // Update W stage
            valid_w <= valid_x;
            instr_w <= instr_x;
            pc_w <= pc_x;
            is_halt_w <= is_halt_x;
            is_reg_write_w <= is_reg_write_x_int;
            rd_w <= rd_x_int;
            result_w <= result_x_int;
            branch_taken_w <= branch_taken_x;
            branch_target_w <= result_x_int; // For branch, result_x_int is target

            // Update X stage
            valid_x <= valid_f; // valid_f is always 1 unless halt
            instr_x <= imem_data;
            pc_x <= npc_f;

            // Update F stage PC
            pc_out <= npc_f;

            // Update Register File
            if (valid_w && is_reg_write_w) begin
                R[rd_w] <= result_w;
            end

            // Update r0_out
            if (valid_w && is_reg_write_w && rd_w == 2'b00) begin
                r0_out <= result_w;
            end

            // Update halt_out
            if (valid_w && is_halt_w) begin
                halt_out <= 1'b1;
            end

            // Update forwarding regs for next cycle's X stage
            is_reg_write_x <= is_reg_write_w;
            rd_x <= rd_w;
            result_x <= result_w;
        end
    end

    // Valid F signal: always valid unless halt
    wire valid_f = ~halt_out;

endmodule

raw model output

```verilog
// Branch convention: 1 branch delay slot. The instruction immediately following
// a BRcc is always fetched and executed. If the branch is taken, the target
// instruction replaces the instruction at PC+2 in the next cycle.
// Forwarding is implemented for W->X and X->X to handle ALU hazards without stalls.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register file: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // Pipeline registers
    // Stage F -> X
    reg [7:0]  instr_x;
    reg [4:0]  pc_x;
    reg        valid_x;

    // Stage X -> W
    reg [7:0]  instr_w;
    reg [4:0]  pc_w;
    reg        valid_w;
    reg        is_halt_w;
    reg        is_reg_write_w;
    reg [1:0]  rd_w;
    reg signed [7:0] result_w;

    // Forwarding registers (holding X stage results for W->X forwarding)
    reg        is_reg_write_x;
    reg [1:0]  rd_x;
    reg signed [7:0] result_x;

    // Next PC logic for F stage
    reg [4:0]  npc_f;
    reg        take_branch_f;

    // Decode/Execute logic
    reg        is_halt_x;
    reg        is_branch_x;
    reg        is_reg_write_x_int;
    reg [1:0]  rd_x_int;
    reg [1:0]  rs_x_int;
    reg signed [7:0] result_x_int;
    reg [3:0]  imm_x;
    reg [1:0]  cond_x;
    reg [3:0]  off_x;
    reg        branch_taken_x;

    // Forwarding mux selects for X stage
    reg signed [7:0] rs_fwd;

    // Helper: sign extend 4-bit to 8-bit
    function signed [7:0] sext4;
        input [3:0] val;
        begin
            sext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: sign extend 4-bit offset to 5-bit for PC addition
    function signed [4:0] sext4_off;
        input [3:0] val;
        begin
            sext4_off = {{4{val[3]}}, val};
        end
    endfunction

    // IMEM access: byte-addressed, 32 bytes. IMEM_INIT is 256-bit.
    // Byte 0 is at bits [255:248], Byte 1 at [247:240], etc.
    // PC is 5-bit, 0..31.
    // Instruction at PC i is IMEM_INIT[(31-i)*8 +: 8] ? 
    // Let's check: "byte 0 at MSB". So byte 0 is MSB of the 256-bit value.
    // In Verilog, if we index IMEM_INIT, bit 255 is the MSB.
    // Byte 0: bits [255:248]
    // Byte 1: bits [247:240]
    // Byte k: bits [255-k*8 : 248-k*8]
    // So for PC=p, instruction = IMEM_INIT[255-p*8 -: 8]
    wire [7:0] imem_data = IMEM_INIT[255 - pc_out*8 -: 8];

    // F Stage: Fetch instruction and compute next PC
    // The next PC depends on the branch decision from the W stage (for the instruction at pc_w)
    // But wait, the branch decision is made in X stage, and the result is in W stage.
    // Actually, for the F stage, we need to know if the instruction in W stage is a taken branch.
    // If so, the F stage should fetch from the branch target instead of PC+1.
    // However, the problem says "1 branch delay slot is acceptable".
    // Standard 3-stage pipeline with delay slot:
    // F: Fetch instr at PC
    // X: Decode instr at PC-1
    // W: Writeback instr at PC-2
    // Branch resolution happens in X stage. The instruction in the delay slot (fetched by F when branch was in X) is executed.
    // If branch taken, the next instruction after the delay slot is the target.
    // So, when the branch instruction is in X stage, the F stage fetches the delay slot instruction (PC+1).
    // When the branch instruction moves to W stage, if taken, the F stage should fetch the target.
    // But the F stage is fetching for the next cycle. The PC for the next cycle's F stage is determined by the W stage's branch result.
    
    // Let's define the pipeline state more carefully.
    // Cycle N:
    // F: Fetch instr at PC_F
    // X: Execute instr at PC_X (which was fetched in cycle N-1)
    // W: Writeback instr at PC_W (which was executed in cycle N-2)
    
    // Branch instruction BRcc is at PC_B.
    // Cycle T: BRcc in X stage. F stage fetches PC_B+1 (delay slot).
    // Cycle T+1: BRcc in W stage. Delay slot instr in X stage. F stage fetches next instr.
    // If BRcc taken, next instr should be target. If not taken, next instr is PC_B+2.
    // So, the F stage in cycle T+2 should fetch from target or PC_B+2 based on W stage result of BRcc.
    
    // Implementation:
    // In W stage, if is_halt_w, halt.
    // If is_branch_w and branch_taken_w, then next PC for F stage is branch_target.
    // Else, next PC is PC_W + 2 (because PC_W+1 was delay slot, PC_W+2 is next).
    // Wait, PC_W is the PC of the instruction in W stage.
    // The instruction in X stage has PC_X = PC_W + 1.
    // The instruction in F stage has PC_F = PC_X + 1 = PC_W + 2.
    // If the instruction in W stage is a taken branch, the instruction that should be fetched next (after the delay slot) is the target.
    // But the delay slot instruction is already in X stage. It will execute.
    // The instruction fetched in F stage (PC_W+2) will go to X stage next cycle.
    // If branch taken, we want the instruction after the delay slot to be the target.
    // So, if branch taken, the F stage should fetch from target.
    // But the F stage is currently fetching PC_W+2. We need to override it.
    
    // Let's use a signal from W stage to override the PC increment.
    reg        branch_taken_w;
    reg [4:0]  branch_target_w;

    // F Stage Logic
    always @(*) begin
        if (halt_out) begin
            npc_f = pc_out; // Freeze PC
            take_branch_f = 0;
        end else if (valid_w && is_branch_w && branch_taken_w) begin
            npc_f = branch_target_w;
            take_branch_f = 1;
        end else begin
            npc_f = pc_out + 1;
            take_branch_f = 0;
        end
    end

    // X Stage Logic
    always @(*) begin
        // Default
        is_halt_x = 0;
        is_branch_x = 0;
        is_reg_write_x_int = 0;
        rd_x_int = 2'b00;
        rs_x_int = 2'b00;
        result_x_int = 8'sb0;
        imm_x = 4'b0000;
        cond_x = 2'b00;
        off_x = 4'b0000;
        branch_taken_x = 0;
        rs_fwd = 8'sb0;

        if (!valid_x) begin
            // No instruction
        end else begin
            case (instr_x[7:6])
                2'b00: begin // ADDI
                    is_reg_write_x_int = 1;
                    rd_x_int = instr_x[5:4];
                    imm_x = instr_x[3:0];
                    // R[rd] = R[rd] + sign_ext(imm)
                    // Forwarding for R[rd] (which is the same as rs for ADDI? No, ADDI only writes rd, reads nothing from R)
                    // ADDI reads nothing from R, so no forwarding needed for rs.
                    result_x_int = R[rd_x_int] + sext4(imm_x);
                end
                2'b01: begin // MOV
                    is_reg_write_x_int = 1;
                    rd_x_int = instr_x[5:4];
                    rs_x_int = instr_x[3:2];
                    // R[rd] = R[rs]
                    // Forwarding for R[rs]
                    rs_fwd = R[rs_x_int]; // Default
                    // Check forwarding from W stage
                    if (valid_w && is_reg_write_w && (rd_w == rs_x_int) && !is_halt_w) begin
                        rs_fwd = result_w;
                    end
                    // Check forwarding from X stage (self-forwarding? No, rs and rd are different usually, but if same, use current R)
                    // If rd_x == rs_x, we should use the current value from R, not the result of the same instruction.
                    // But in a pipeline, the X stage result is not yet committed. So we use R[rs].
                    // However, if the previous instruction (in W) wrote to rs, we forward from W.
                    // If the previous instruction (in X) wrote to rs, we forward from X.
                    if (valid_w && is_reg_write_w && (rd_w == rs_x_int) && !is_halt_w) begin
                        rs_fwd = result_w;
                    end else if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                        // Self-forward: use current R value
                        rs_fwd = R[rs_x_int];
                    end else if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                         // This case is covered above.
                    end else begin
                        // Check X stage forwarding for rs
                        if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                            rs_fwd = R[rs_x_int]; // Use current R if rd==rs
                        end else begin
                            rs_fwd = R[rs_x_int];
                        end
                    end
                    // Re-evaluate forwarding logic more cleanly:
                    // Priority: W stage > X stage > Register File
                    if (valid_w && is_reg_write_w && (rd_w == rs_x_int) && !is_halt_w) begin
                        rs_fwd = result_w;
                    end else if (is_reg_write_x_int && (rd_x_int == rs_x_int)) begin
                        // If the current instruction writes to rd and reads from rs, and rd==rs,
                        // we should use the current value from R, not the result of the current instruction.
                        rs_fwd = R[rs_x_int];
                    end else begin
                        rs_fwd = R[rs_x_int];
                    end
                    result_x_int = rs_fwd;
                end
                2'b10: begin // BRcc
                    is_branch_x = 1;
                    cond_x = instr_x[5:4];
                    off_x = instr_x[3:0];
                    // Branch condition
                    case (cond_x)
                        2'b00: branch_taken_x = 1; // Always
                        2'b01: branch_taken_x = (R[0] == 8'sb0); // Zero
                        2'b10: branch_taken_x = (R[0] < 8'sb0); // Negative
                        2'b11: branch_taken_x = (R[0] > 8'sb0); // Positive
                        default: branch_taken_x = 0;
                    endcase
                    // Compute branch target: PC + sign_ext(off) + 1
                    // PC for this instruction is pc_x
                    result_x_int = pc_x + sext4_off(off_x) + 1;
                end
                2'b11: begin // HALT
                    is_halt_x = 1;
                end
                default: begin
                    // NOP or invalid
                end
            endcase
        end
    end

    // W Stage Logic
    always @(*) begin
        is_halt_w = 0;
        is_branch_w = 0;
        is_reg_write_w = 0;
        rd_w = 2'b00;
        result_w = 8'sb0;
        branch_taken_w = 0;
        branch_target_w = 5'b00000;

        if (!valid_w) begin
            // No instruction
        end else begin
            if (is_halt_x) begin
                is_halt_w = 1;
            end else if (is_branch_x) begin
                is_branch_w = 1;
                branch_taken_w = branch_taken_x;
                branch_target_w = result_x_int; // This is the target PC
            end else if (is_reg_write_x_int) begin
                is_reg_write_w = 1;
                rd_w = rd_x_int;
                result_w = result_x_int;
            end
        end
    end

    // Pipeline Register Updates
    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 5'b00000;
            halt_out <= 1'b0;
            r0_out <= 8'sb0;
            for (int i = 0; i < 4; i++) R[i] <= 8'sb0;
            valid_x <= 1'b0;
            valid_w <= 1'b0;
            instr_x <= 8'b0;
            pc_x <= 5'b0;
            instr_w <= 8'b0;
            pc_w <= 5'b0;
            is_halt_w <= 1'b0;
            is_reg_write_w <= 1'b0;
            rd_w <= 2'b0;
            result_w <= 8'sb0;
            is_reg_write_x <= 1'b0;
            rd_x <= 2'b0;
            result_x <= 8'sb0;
        end else if (halt_out) begin
            // Freeze everything
            pc_out <= pc_out;
            halt_out <= 1'b1;
            r0_out <= r0_out;
            // R remains unchanged
            valid_x <= 1'b0;
            valid_w <= 1'b0;
            // Keep other regs stable
        end else begin
            // Update W stage
            valid_w <= valid_x;
            instr_w <= instr_x;
            pc_w <= pc_x;
            is_halt_w <= is_halt_x;
            is_reg_write_w <= is_reg_write_x_int;
            rd_w <= rd_x_int;
            result_w <= result_x_int;
            branch_taken_w <= branch_taken_x;
            branch_target_w <= result_x_int; // For branch, result_x_int is target

            // Update X stage
            valid_x <= valid_f; // valid_f is always 1 unless halt
            instr_x <= imem_data;
            pc_x <= npc_f;

            // Update F stage PC
            pc_out <= npc_f;

            // Update Register File
            if (valid_w && is_reg_write_w) begin
                R[rd_w] <= result_w;
            end

            // Update r0_out
            if (valid_w && is_reg_write_w && rd_w == 2'b00) begin
                r0_out <= result_w;
            end

            // Update halt_out
            if (valid_w && is_halt_w) begin
                halt_out <= 1'b1;
            end

            // Update forwarding regs for next cycle's X stage
            is_reg_write_x <= is_reg_write_w;
            rd_x <= rd_w;
            result_x <= result_w;
        end
    end

    // Valid F signal: always valid unless halt
    wire valid_f = ~halt_out;

endmodule
```

line-item PPA

per-program functional

pipeline shape (heuristic register bank count)

mistakes detected

extracted RTL

raw model output