sft_6

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC candidate for fetch stage (PC+1)

    // Stage X: Decode/Execute
    reg [7:0] instr_x;
    reg [4:0] pc_x;
    reg [4:0] npc_x; // Next PC candidate from previous stage
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [1:0] cond_x;
    reg [3:0] off_x;
    reg signed [7:0] rs_val_x; // Read value of rs
    reg signed [7:0] alu_res_x; // Result of ALU operation
    reg branch_taken_x;
    reg [4:0] branch_target_x;
    reg is_halt_x;

    // Stage W: Writeback
    reg [7:0] instr_w;
    reg [1:0] rd_w;
    reg signed [7:0] wb_data_w;
    reg is_halt_w;
    reg valid_w;

    // Helper: Sign extend 4-bit to 8-bit
    function signed [7:0] sign_ext4;
        input [3:0] val;
        begin
            sign_ext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: Sign extend 4-bit offset to 5-bit for PC addition
    function signed [5:0] sign_ext4_pc;
        input [3:0] val;
        begin
            sign_ext4_pc = {{6{val[3]}}, val};
        end
    endfunction

    // Forwarding Logic
    // We need to forward results from X stage to X stage inputs if the rd of X matches rs of current X.
    // Since we don't have a separate WB register for forwarding source other than X and W,
    // and W is the previous instruction's result, we can forward from W if valid.
    // However, the problem states "Forwarding REQUIRED for back-to-back ADDI / MOV".
    // This implies if Instr[i] writes R[d] and Instr[i+1] reads R[s]==R[d], we must forward.
    // In a 3-stage pipeline:
    // Cycle T: Instr[i] is in X, Instr[i+1] is in F
    // Cycle T+1: Instr[i] is in W, Instr[i+1] is in X.
    // So we can forward from W stage to X stage inputs.
    
    reg signed [7:0] fwd_val_x;
    reg fwd_en_x;

    always @(*) begin
        // Default: read from register file
        fwd_en_x = 0;
        fwd_val_x = rf[rs_x];

        // Forward from W stage if:
        // 1. W stage has a valid write to a register
        // 2. The rd in W matches rs in X
        // 3. The instruction in W is not HALT (HALT doesn't write registers)
        if (valid_w && !is_halt_w && (rd_w != 2'b00 || instr_w[7:6] != 2'b00) && rd_w == rs_x) begin
            // Note: MOV and ADDI write to rd. 
            // We check if rd_w is the target register.
            // Special case: if rd_w is 0, it's usually ignored in some ISAs, but here R[0..3] are valid.
            // Let's just forward if rd matches rs.
            fwd_en_x = 1;
            fwd_val_x = wb_data_w;
        end
    end

    // Determine actual rs value for ALU
    reg signed [7:0] rs_eff_x;
    always @(*) begin
        if (fwd_en_x) begin
            rs_eff_x = fwd_val_x;
        end else begin
            rs_eff_x = rf[rs_x];
        end
    end

    // Combinational Decode for X stage inputs (for simulation clarity, though we use regs)
    // The decoding happens when moving from F to X.

    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 0;
            halt_out <= 0;
            r0_out <= 0;
            for (int i = 0; i < 4; i++) rf[i] <= 0;
            
            // Clear Pipeline
            pc_f <= 0;
            instr_f <= 0;
            npc_f <= 0;
            
            instr_x <= 0;
            pc_x <= 0;
            npc_x <= 0;
            opcode_x <= 0;
            rd_x <= 0;
            rs_x <= 0;
            imm_x <= 0;
            cond_x <= 0;
            off_x <= 0;
            rs_val_x <= 0;
            alu_res_x <= 0;
            branch_taken_x <= 0;
            branch_target_x <= 0;
            is_halt_x <= 0;
            
            instr_w <= 0;
            rd_w <= 0;
            wb_data_w <= 0;
            is_halt_w <= 0;
            valid_w <= 0;
        end else begin
            // Update Outputs
            pc_out <= pc_f;
            
            // Update Register File and r0_out based on W stage
            if (valid_w) begin
                if (is_halt_w) begin
                    halt_out <= 1;
                end else begin
                    // Write back to register file
                    if (instr_w[7:6] == 2'b00) begin // ADDI
                        rf[rd_w] <= wb_data_w;
                    end else if (instr_w[7:6] == 2'b01) begin // MOV
                        rf[rd_w] <= wb_data_w;
                    end
                    // HALT and BRcc do not write to RF
                    
                    // Update r0_out if R0 was written
                    if (rd_w == 2'b00) begin
                        r0_out <= wb_data_w;
                    end
                end
            end else begin
                // If pipeline is stalled or cleared, halt_out stays as is (unless reset)
                // If halt_out is already 1, it stays 1.
                if (halt_out) begin
                    halt_out <= 1;
                end
            end

            // Stage W Update
            valid_w <= valid_x; // valid_x indicates X stage had a valid instruction
            instr_w <= instr_x;
            rd_w <= rd_x;
            wb_data_w <= alu_res_x;
            is_halt_w <= is_halt_x;

            // Stage X Update
            // Decode and Execute
            // Inputs come from F stage
            
            // First, determine if we should forward for the X stage calculation
            // The forwarding logic above uses current W stage values.
            
            // Decode Opcode
            opcode_x <= instr_f[7:6];
            
            if (instr_f[7:6] == 2'b00) begin // ADDI
                rd_x <= instr_f[5:4];
                // rs is not used in ADDI, but we can set it to 0
                rs_x <= 2'b00; 
                imm_x <= instr_f[3:0];
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                
                // Execute: R[rd] <= R[rd] + sign_ext(imm, 4)
                // Note: ADDI reads R[rd] as the source register too.
                // So we need to forward to the 'rd' register value.
                // Let's treat rs_x as rd_x for the purpose of reading the base value.
                rs_x <= instr_f[5:4]; // Actually, we need to read R[rd]
                
                // Re-evaluate rs_eff_x for ADDI: it should be R[rd]
                // The forwarding logic checks rs_x. So we set rs_x to rd.
                
                // Calculate ALU Result
                // We need the current value of R[rd_x].
                // We use rs_eff_x which is forwarded R[rs_x]. Since rs_x is set to rd, this works.
                alu_res_x <= rs_eff_x + sign_ext4(imm_x);
                
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 0;
                valid_x <= 1; // Valid instruction
                
            end else if (instr_f[7:6] == 2'b01) begin // MOV
                rd_x <= instr_f[5:4];
                rs_x <= instr_f[3:2];
                imm_x <= 4'b0000;
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                
                // Execute: R[rd] <= R[rs]
                alu_res_x <= rs_eff_x;
                
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 0;
                valid_x <= 1;
                
            end else if (instr_f[7:6] == 2'b10) begin // BRcc
                cond_x <= instr_f[5:4];
                off_x <= instr_f[3:0];
                rd_x <= 2'b00; // Unused
                rs_x <= 2'b00; // Unused
                
                // Execute: Check condition
                // Condition based on R0
                // 00: always
                // 01: zero(R0)
                // 10: neg(R0)
                // 11: pos(R0)
                
                // We need the current value of R0.
                // Does BRcc need forwarding? Yes, if previous instruction wrote R0.
                // We can use the same forwarding mechanism. Let's set rs_x to 0 to read R0.
                rs_x <= 2'b00;
                
                // Calculate branch target
                // PC <= PC + sign_ext(off,4) + 1
                // The PC used here is the PC of the BRcc instruction itself (pc_x will be updated to pc_f next cycle, but for calculation we use pc_f)
                // Wait, pc_x in the NEXT cycle will be pc_f of THIS cycle.
                // The branch target calculation should use the PC of the branch instruction.
                // In the X stage, we have pc_x which is the PC of the instruction being executed.
                // But we are updating X from F. So the PC of the instruction in X is pc_f.
                
                branch_target_x <= pc_f + sign_ext4_pc(off_x) + 1;
                
                // Determine if branch is taken
                case (cond_x)
                    2'b00: branch_taken_x = 1;
                    2'b01: branch_taken_x = (rs_eff_x == 0) ? 1 : 0;
                    2'b10: branch_taken_x = (rs_eff_x < 0) ? 1 : 0;
                    2'b11: branch_taken_x = (rs_eff_x > 0) ? 1 : 0;
                    default: branch_taken_x = 0;
                endcase
                
                alu_res_x <= 0; // Not used for writeback
                is_halt_x <= 0;
                valid_x <= 1;
                
            end else if (instr_f[7:6] == 2'b11) begin // HALT
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0000;
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                
                alu_res_x <= 0;
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 1;
                valid_x <= 1;
            end else begin
                // Invalid opcode, treat as NOP
                opcode_x <= 2'b00;
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0000;
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                alu_res_x <= 0;
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 0;
                valid_x <= 0; // Invalid instruction, don't writeback
            end
            
            // Update X stage registers
            instr_x <= instr_f;
            pc_x <= pc_f;
            npc_x <= npc_f; // This is the PC+1 of the current instruction
            
            // Stage F Update
            // Determine next PC
            // If the instruction in X stage (which was F stage last cycle) was a branch and taken,
            // then the next PC should be the branch target.
            // However, due to the delay slot, the instruction after the branch (which is currently in F)
            // will execute. The PC after the delay slot should be the branch target.
            // So, if branch_taken_x is true, the next PC (for the instruction after the delay slot) is branch_target_x.
            // But wait, the instruction currently in F is the delay slot instruction.
            // Its PC is npc_f (which is pc_f + 1).
            // After the delay slot executes, the PC should update to branch_target_x.
            // So, if branch_taken_x is true, we set the next PC to branch_target_x.
            // If branch_taken_x is false, we set the next PC to npc_f (which is pc_f + 1, but wait, npc_f is already pc_f+1).
            // Actually, let's trace carefully.
            
            // Cycle T:
            // F: PC=0, Instr=BRcc
            // X: PC=0, Instr=Prev (NOP)
            // W: PC=0, Instr=PrevPrev
            
            // Cycle T+1:
            // F: PC=1 (npc_f from T), Instr=DelaySlot
            // X: PC=0, Instr=BRcc (from F in T)
            // W: PC=0, Instr=Prev (from X in T)
            
            // In Cycle T+1, X stage evaluates BRcc. If taken, branch_target is calculated.
            // The instruction in F (DelaySlot) will execute.
            // After Cycle T+1, we want the next PC (for Cycle T+2 F stage) to be branch_target if taken.
            // If not taken, the next PC should be PC(DelaySlot) + 1 = npc_f + 1? 
            // No, npc_f is PC(F)+1. So if not taken, next PC is npc_f + 1? 
            // Wait, if BRcc is not taken, the PC continues sequentially.
            // The BRcc instruction is at PC=0. The delay slot is at PC=1.
            // After delay slot executes, PC should be 2.
            // npc_f in Cycle T+1 is 1 (from PC=0 in Cycle T).
            // So next PC should be npc_f + 1 = 2.
            
            // If BRcc is taken, next PC should be branch_target_x.
            
            // So, logic for next PC (pc_f_next):
            // if (branch_taken_x) pc_f_next = branch_target_x;
            // else pc_f_next = npc_f + 1;
            
            // But wait, what if the instruction in X is not a branch?
            // Then branch_taken_x is 0.
            // So pc_f_next = npc_f + 1.
            // This is correct for sequential execution.
            
            // What if the instruction in X is HALT?
            // Then we freeze the pipeline. pc_out stays at current pc_f.
            // But halt_out is set in W stage.
            // If is_halt_x is true, we should not update pc_f.
            
            if (is_halt_x) begin
                // Freeze pipeline
                pc_f <= pc_f;
                instr_f <= instr_f;
                npc_f <= npc_f;
            end else if (halt_out) begin
                // Already halted, stay frozen
                pc_f <= pc_f;
                instr_f <= instr_f;
                npc_f <= npc_f;
            end else begin
                if (branch_taken_x) begin
                    pc_f <= branch_target_x;
                end else begin
                    pc_f <= npc_f + 1;
                end
                
                // Fetch instruction
                // IMEM_INIT is 256 bits. PC is 5 bits (0-31).
                // Byte address = PC.
                // IMEM_INIT[PC*8 +: 8]
                instr_f <= IMEM_INIT[pc_f*8 +: 8];
                
                // npc_f is PC+1 for the current instruction
                npc_f <= pc_f + 1;
            end
        end
    end

    // Helper reg for valid_x
    reg valid_x;
    always @(*) begin
        if (opcode_x == 2'b00 || opcode_x == 2'b01 || opcode_x == 2'b10 || opcode_x == 2'b11) begin
            valid_x = 1;
        end else begin
            valid_x = 0;
        end
    end

endmodule

raw model output

```verilog
// Pipeline convention: Branch delay slot. The instruction immediately following a
// BRcc instruction is always fetched and executed, regardless of whether the
// branch is taken. If the branch is taken, the PC updates to the branch target
// after the delay slot instruction completes. This means the delay slot
// instruction executes in the context of the sequential PC, not the branch target.
// Forwarding is implemented from the Execute stage to the Execute stage inputs
// to handle back-to-back ALU/MOV dependencies without stalls.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Registers
    // Stage F: Fetch
    reg [4:0] pc_f;
    reg [7:0] instr_f;
    reg [4:0] npc_f; // Next PC candidate for fetch stage (PC+1)

    // Stage X: Decode/Execute
    reg [7:0] instr_x;
    reg [4:0] pc_x;
    reg [4:0] npc_x; // Next PC candidate from previous stage
    reg [1:0] opcode_x;
    reg [1:0] rd_x;
    reg [1:0] rs_x;
    reg [3:0] imm_x;
    reg [1:0] cond_x;
    reg [3:0] off_x;
    reg signed [7:0] rs_val_x; // Read value of rs
    reg signed [7:0] alu_res_x; // Result of ALU operation
    reg branch_taken_x;
    reg [4:0] branch_target_x;
    reg is_halt_x;

    // Stage W: Writeback
    reg [7:0] instr_w;
    reg [1:0] rd_w;
    reg signed [7:0] wb_data_w;
    reg is_halt_w;
    reg valid_w;

    // Helper: Sign extend 4-bit to 8-bit
    function signed [7:0] sign_ext4;
        input [3:0] val;
        begin
            sign_ext4 = {{4{val[3]}}, val};
        end
    endfunction

    // Helper: Sign extend 4-bit offset to 5-bit for PC addition
    function signed [5:0] sign_ext4_pc;
        input [3:0] val;
        begin
            sign_ext4_pc = {{6{val[3]}}, val};
        end
    endfunction

    // Forwarding Logic
    // We need to forward results from X stage to X stage inputs if the rd of X matches rs of current X.
    // Since we don't have a separate WB register for forwarding source other than X and W,
    // and W is the previous instruction's result, we can forward from W if valid.
    // However, the problem states "Forwarding REQUIRED for back-to-back ADDI / MOV".
    // This implies if Instr[i] writes R[d] and Instr[i+1] reads R[s]==R[d], we must forward.
    // In a 3-stage pipeline:
    // Cycle T: Instr[i] is in X, Instr[i+1] is in F
    // Cycle T+1: Instr[i] is in W, Instr[i+1] is in X.
    // So we can forward from W stage to X stage inputs.
    
    reg signed [7:0] fwd_val_x;
    reg fwd_en_x;

    always @(*) begin
        // Default: read from register file
        fwd_en_x = 0;
        fwd_val_x = rf[rs_x];

        // Forward from W stage if:
        // 1. W stage has a valid write to a register
        // 2. The rd in W matches rs in X
        // 3. The instruction in W is not HALT (HALT doesn't write registers)
        if (valid_w && !is_halt_w && (rd_w != 2'b00 || instr_w[7:6] != 2'b00) && rd_w == rs_x) begin
            // Note: MOV and ADDI write to rd. 
            // We check if rd_w is the target register.
            // Special case: if rd_w is 0, it's usually ignored in some ISAs, but here R[0..3] are valid.
            // Let's just forward if rd matches rs.
            fwd_en_x = 1;
            fwd_val_x = wb_data_w;
        end
    end

    // Determine actual rs value for ALU
    reg signed [7:0] rs_eff_x;
    always @(*) begin
        if (fwd_en_x) begin
            rs_eff_x = fwd_val_x;
        end else begin
            rs_eff_x = rf[rs_x];
        end
    end

    // Combinational Decode for X stage inputs (for simulation clarity, though we use regs)
    // The decoding happens when moving from F to X.

    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 0;
            halt_out <= 0;
            r0_out <= 0;
            for (int i = 0; i < 4; i++) rf[i] <= 0;
            
            // Clear Pipeline
            pc_f <= 0;
            instr_f <= 0;
            npc_f <= 0;
            
            instr_x <= 0;
            pc_x <= 0;
            npc_x <= 0;
            opcode_x <= 0;
            rd_x <= 0;
            rs_x <= 0;
            imm_x <= 0;
            cond_x <= 0;
            off_x <= 0;
            rs_val_x <= 0;
            alu_res_x <= 0;
            branch_taken_x <= 0;
            branch_target_x <= 0;
            is_halt_x <= 0;
            
            instr_w <= 0;
            rd_w <= 0;
            wb_data_w <= 0;
            is_halt_w <= 0;
            valid_w <= 0;
        end else begin
            // Update Outputs
            pc_out <= pc_f;
            
            // Update Register File and r0_out based on W stage
            if (valid_w) begin
                if (is_halt_w) begin
                    halt_out <= 1;
                end else begin
                    // Write back to register file
                    if (instr_w[7:6] == 2'b00) begin // ADDI
                        rf[rd_w] <= wb_data_w;
                    end else if (instr_w[7:6] == 2'b01) begin // MOV
                        rf[rd_w] <= wb_data_w;
                    end
                    // HALT and BRcc do not write to RF
                    
                    // Update r0_out if R0 was written
                    if (rd_w == 2'b00) begin
                        r0_out <= wb_data_w;
                    end
                end
            end else begin
                // If pipeline is stalled or cleared, halt_out stays as is (unless reset)
                // If halt_out is already 1, it stays 1.
                if (halt_out) begin
                    halt_out <= 1;
                end
            end

            // Stage W Update
            valid_w <= valid_x; // valid_x indicates X stage had a valid instruction
            instr_w <= instr_x;
            rd_w <= rd_x;
            wb_data_w <= alu_res_x;
            is_halt_w <= is_halt_x;

            // Stage X Update
            // Decode and Execute
            // Inputs come from F stage
            
            // First, determine if we should forward for the X stage calculation
            // The forwarding logic above uses current W stage values.
            
            // Decode Opcode
            opcode_x <= instr_f[7:6];
            
            if (instr_f[7:6] == 2'b00) begin // ADDI
                rd_x <= instr_f[5:4];
                // rs is not used in ADDI, but we can set it to 0
                rs_x <= 2'b00; 
                imm_x <= instr_f[3:0];
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                
                // Execute: R[rd] <= R[rd] + sign_ext(imm, 4)
                // Note: ADDI reads R[rd] as the source register too.
                // So we need to forward to the 'rd' register value.
                // Let's treat rs_x as rd_x for the purpose of reading the base value.
                rs_x <= instr_f[5:4]; // Actually, we need to read R[rd]
                
                // Re-evaluate rs_eff_x for ADDI: it should be R[rd]
                // The forwarding logic checks rs_x. So we set rs_x to rd.
                
                // Calculate ALU Result
                // We need the current value of R[rd_x].
                // We use rs_eff_x which is forwarded R[rs_x]. Since rs_x is set to rd, this works.
                alu_res_x <= rs_eff_x + sign_ext4(imm_x);
                
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 0;
                valid_x <= 1; // Valid instruction
                
            end else if (instr_f[7:6] == 2'b01) begin // MOV
                rd_x <= instr_f[5:4];
                rs_x <= instr_f[3:2];
                imm_x <= 4'b0000;
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                
                // Execute: R[rd] <= R[rs]
                alu_res_x <= rs_eff_x;
                
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 0;
                valid_x <= 1;
                
            end else if (instr_f[7:6] == 2'b10) begin // BRcc
                cond_x <= instr_f[5:4];
                off_x <= instr_f[3:0];
                rd_x <= 2'b00; // Unused
                rs_x <= 2'b00; // Unused
                
                // Execute: Check condition
                // Condition based on R0
                // 00: always
                // 01: zero(R0)
                // 10: neg(R0)
                // 11: pos(R0)
                
                // We need the current value of R0.
                // Does BRcc need forwarding? Yes, if previous instruction wrote R0.
                // We can use the same forwarding mechanism. Let's set rs_x to 0 to read R0.
                rs_x <= 2'b00;
                
                // Calculate branch target
                // PC <= PC + sign_ext(off,4) + 1
                // The PC used here is the PC of the BRcc instruction itself (pc_x will be updated to pc_f next cycle, but for calculation we use pc_f)
                // Wait, pc_x in the NEXT cycle will be pc_f of THIS cycle.
                // The branch target calculation should use the PC of the branch instruction.
                // In the X stage, we have pc_x which is the PC of the instruction being executed.
                // But we are updating X from F. So the PC of the instruction in X is pc_f.
                
                branch_target_x <= pc_f + sign_ext4_pc(off_x) + 1;
                
                // Determine if branch is taken
                case (cond_x)
                    2'b00: branch_taken_x = 1;
                    2'b01: branch_taken_x = (rs_eff_x == 0) ? 1 : 0;
                    2'b10: branch_taken_x = (rs_eff_x < 0) ? 1 : 0;
                    2'b11: branch_taken_x = (rs_eff_x > 0) ? 1 : 0;
                    default: branch_taken_x = 0;
                endcase
                
                alu_res_x <= 0; // Not used for writeback
                is_halt_x <= 0;
                valid_x <= 1;
                
            end else if (instr_f[7:6] == 2'b11) begin // HALT
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0000;
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                
                alu_res_x <= 0;
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 1;
                valid_x <= 1;
            end else begin
                // Invalid opcode, treat as NOP
                opcode_x <= 2'b00;
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0000;
                cond_x <= 2'b00;
                off_x <= 4'b0000;
                alu_res_x <= 0;
                branch_taken_x <= 0;
                branch_target_x <= 0;
                is_halt_x <= 0;
                valid_x <= 0; // Invalid instruction, don't writeback
            end
            
            // Update X stage registers
            instr_x <= instr_f;
            pc_x <= pc_f;
            npc_x <= npc_f; // This is the PC+1 of the current instruction
            
            // Stage F Update
            // Determine next PC
            // If the instruction in X stage (which was F stage last cycle) was a branch and taken,
            // then the next PC should be the branch target.
            // However, due to the delay slot, the instruction after the branch (which is currently in F)
            // will execute. The PC after the delay slot should be the branch target.
            // So, if branch_taken_x is true, the next PC (for the instruction after the delay slot) is branch_target_x.
            // But wait, the instruction currently in F is the delay slot instruction.
            // Its PC is npc_f (which is pc_f + 1).
            // After the delay slot executes, the PC should update to branch_target_x.
            // So, if branch_taken_x is true, we set the next PC to branch_target_x.
            // If branch_taken_x is false, we set the next PC to npc_f (which is pc_f + 1, but wait, npc_f is already pc_f+1).
            // Actually, let's trace carefully.
            
            // Cycle T:
            // F: PC=0, Instr=BRcc
            // X: PC=0, Instr=Prev (NOP)
            // W: PC=0, Instr=PrevPrev
            
            // Cycle T+1:
            // F: PC=1 (npc_f from T), Instr=DelaySlot
            // X: PC=0, Instr=BRcc (from F in T)
            // W: PC=0, Instr=Prev (from X in T)
            
            // In Cycle T+1, X stage evaluates BRcc. If taken, branch_target is calculated.
            // The instruction in F (DelaySlot) will execute.
            // After Cycle T+1, we want the next PC (for Cycle T+2 F stage) to be branch_target if taken.
            // If not taken, the next PC should be PC(DelaySlot) + 1 = npc_f + 1? 
            // No, npc_f is PC(F)+1. So if not taken, next PC is npc_f + 1? 
            // Wait, if BRcc is not taken, the PC continues sequentially.
            // The BRcc instruction is at PC=0. The delay slot is at PC=1.
            // After delay slot executes, PC should be 2.
            // npc_f in Cycle T+1 is 1 (from PC=0 in Cycle T).
            // So next PC should be npc_f + 1 = 2.
            
            // If BRcc is taken, next PC should be branch_target_x.
            
            // So, logic for next PC (pc_f_next):
            // if (branch_taken_x) pc_f_next = branch_target_x;
            // else pc_f_next = npc_f + 1;
            
            // But wait, what if the instruction in X is not a branch?
            // Then branch_taken_x is 0.
            // So pc_f_next = npc_f + 1.
            // This is correct for sequential execution.
            
            // What if the instruction in X is HALT?
            // Then we freeze the pipeline. pc_out stays at current pc_f.
            // But halt_out is set in W stage.
            // If is_halt_x is true, we should not update pc_f.
            
            if (is_halt_x) begin
                // Freeze pipeline
                pc_f <= pc_f;
                instr_f <= instr_f;
                npc_f <= npc_f;
            end else if (halt_out) begin
                // Already halted, stay frozen
                pc_f <= pc_f;
                instr_f <= instr_f;
                npc_f <= npc_f;
            end else begin
                if (branch_taken_x) begin
                    pc_f <= branch_target_x;
                end else begin
                    pc_f <= npc_f + 1;
                end
                
                // Fetch instruction
                // IMEM_INIT is 256 bits. PC is 5 bits (0-31).
                // Byte address = PC.
                // IMEM_INIT[PC*8 +: 8]
                instr_f <= IMEM_INIT[pc_f*8 +: 8];
                
                // npc_f is PC+1 for the current instruction
                npc_f <= pc_f + 1;
            end
        end
    end

    // Helper reg for valid_x
    reg valid_x;
    always @(*) begin
        if (opcode_x == 2'b00 || opcode_x == 2'b01 || opcode_x == 2'b10 || opcode_x == 2'b11) begin
            valid_x = 1;
        end else begin
            valid_x = 0;
        end
    end

endmodule
```

line-item PPA

per-program functional

pipeline shape (heuristic register bank count)

mistakes detected

extracted RTL

raw model output