sft_0

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // Pipeline Registers
    // Stage F (Fetch) -> Stage X (Execute)
    reg [7:0]  instr_x;
    reg [4:0]  pc_x;
    reg        valid_x;

    // Stage X (Execute) -> Stage W (Writeback)
    reg [7:0]  instr_w;
    reg [4:0]  pc_w;
    reg        valid_w;
    reg [1:0]  rd_w;    // destination register index
    reg signed [7:0] data_w; // data to write
    reg        we_w;    // write enable
    reg        halt_w;  // halt signal

    // Stage F internal
    reg [4:0]  pc_f;
    reg [7:0]  instr_f;
    reg        valid_f;

    // Next PC logic for F stage
    reg [4:0]  pc_next;

    // Forwarding signals
    reg        fwd_x_w; // Forward from W stage to X stage
    reg        fwd_x_x; // Forward from X stage to X stage (bypass)
    reg signed [7:0] fwd_data;

    // Helper function to extract byte from IMEM_INIT
    // IMEM_INIT is 256 bits. Byte 0 is bits [255:248], Byte 1 is [247:240], etc.
    // PC is 0-31. Byte index = PC.
    // Value = IMEM_INIT[ (31-PC)*8 +: 8 ]
    function [7:0] imem_read;
        input [4:0] addr;
        begin
            imem_read = IMEM_INIT[ (5'd31 - addr) * 8 +: 8 ];
        end
    endfunction

    // Decode helpers
    function [1:0] get_rd;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b00) get_rd = instr[5:4]; // ADDI
            else if (instr[7:6] == 2'b01) get_rd = instr[5:4]; // MOV
            else get_rd = 2'b00; // BRcc, HALT don't write R
        end
    endfunction

    function [1:0] get_rs;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b01) get_rs = instr[3:2]; // MOV
            else get_rs = 2'b00; // ADDI uses immediate, BRcc uses R0 for condition, HALT does nothing
        end
    endfunction

    function signed [7:0] sign_ext_4;
        input [3:0] imm;
        begin
            sign_ext_4 = {{4{imm[3]}}, imm};
        end
    endfunction

    // Determine if a register is being written in a stage
    function we_x;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b00) we_x = 1; // ADDI
            else if (instr[7:6] == 2'b01) we_x = 1; // MOV
            else we_x = 0;
        end
    endfunction

    function we_w;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b00) we_w = 1; // ADDI
            else if (instr[7:6] == 2'b01) we_w = 1; // MOV
            else we_w = 0;
        end
    endfunction

    // Forwarding Logic
    // We need to forward data to the X stage if the instruction in X reads a register
    // that is being written by the instruction in W or X.
    // Priority: W stage (older) over X stage (younger) is not quite right.
    // Standard forwarding: X stage (producing result this cycle) is available for next cycle's X stage?
    // No, in a 3-stage pipeline:
    // Cycle N: Instr A in X, Instr B in F
    // Cycle N+1: Instr A in W, Instr B in X
    // If B reads a register written by A, we can forward A's result from W stage (which is computed in X stage of Cycle N) to B in X stage of Cycle N+1.
    // Also, if Instr C is in F in Cycle N+1, it will be in X in Cycle N+2.
    // Can we forward from W stage (Instr A) to X stage (Instr C)? Yes, if A is in W and C is in X.
    // Can we forward from X stage (Instr B) to X stage (Instr C)? No, B is in X, C is in F. C will be in X next cycle.
    // So, forwarding sources for X stage inputs:
    // 1. W stage result (if W instruction writes to the register needed by X instruction)
    // 2. X stage result (if X instruction writes to the register needed by X instruction? No, X instruction is computing its own result.
    //    Wait, if Instr B is in X, it reads R[rs]. If Instr A is in W, it writes R[rd].
    //    If B's rs == A's rd, forward A's data_w to B.
    //    What if Instr C is in F? It will be in X next cycle.
    //    If Instr B is in X and writes to R[rd], and Instr C (next in F) reads R[rs] == B's rd,
    //    then in next cycle, B will be in W, C in X. We can forward from W (B) to X (C).
    //    So we only need to forward from W stage to X stage.
    //    Is there a case where we need to forward from X stage to X stage?
    //    Only if the same instruction reads and writes the same register? e.g. ADDI R0, R0.
    //    In that case, the value is read from the register file in the X stage.
    //    The new value is computed in X stage and written in W stage.
    //    So for ADDI R0, R0, the read happens in X, write in W. No forwarding needed within X.
    //    So, forwarding is only from W stage to X stage.
    
    // Let's refine:
    // Instr A (in W): writes R[rd_w] with data_w
    // Instr B (in X): reads R[rs_x]
    // If rs_x == rd_w and we_w is 1, then forward data_w to B.
    
    // What about Instr B (in X) writing to R[rd_x] and Instr C (in F) reading R[rs_c]?
    // In next cycle, B is in W, C is in X.
    // So C will see B's result from W stage.
    // So yes, only W->X forwarding is needed for back-to-back instructions.

    always @(*) begin
        // Default: no forwarding
        fwd_x_w = 0;
        fwd_x_x = 0;
        fwd_data = 0;

        // Check W stage forwarding
        if (valid_w && we_w && (rd_w == get_rs(instr_x))) begin
            fwd_x_w = 1;
            fwd_data = data_w;
        end
        // Note: We don't need X->X forwarding because the X stage instruction
        // reads the register file before computing the new value.
        // The new value is not available until the end of the X stage.
        // So it can only be used by the next instruction in the next cycle.
    end

    // Register File Read
    function signed [7:0] read_R;
        input [1:0] idx;
        begin
            read_R = R[idx];
        end
    endfunction

    // Stage F: Fetch
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0;
            instr_f <= 8'h00;
            valid_f <= 1'b1;
        end else if (halt_out) begin
            // Freeze pipeline
            pc_f <= pc_f;
            instr_f <= instr_f;
            valid_f <= 1'b1;
        end else begin
            // Determine next PC
            // If branch taken, PC = branch_target
            // Else PC = PC + 1
            // But we need to handle the delay slot.
            // The instruction after the branch is always executed.
            // So if branch is taken, the PC after the delay slot is the target.
            // The delay slot instruction is at PC_branch + 1.
            // So if branch is taken, the next fetch PC is target.
            // If branch is not taken, the next fetch PC is PC_branch + 2.
            // However, the F stage fetches the instruction at pc_f.
            // The X stage determines if the branch is taken.
            // The W stage does nothing for branches.
            
            // Let's use a simple approach:
            // pc_next is computed in F stage based on the instruction in X stage?
            // No, F stage fetches based on current pc_f.
            // The next pc_f should be pc_f + 1 unless a branch in the previous instruction (now in X) was taken.
            // But the branch instruction is in X stage.
            // So, if the instruction in X stage is a branch and it is taken, then the next fetch PC should be the branch target.
            // But wait, the instruction in F stage is the one after the branch (delay slot).
            // So if the branch is taken, the instruction after the delay slot should be fetched next.
            // So pc_next = branch_target.
            // If the branch is not taken, pc_next = pc_f + 1 (which is pc_branch + 2).
            
            // Let's compute pc_next based on the X stage instruction.
            // If X stage instruction is BRcc and taken, pc_next = branch_target.
            // Else pc_next = pc_f + 1.
            
            // But pc_f is the PC of the instruction currently in F stage.
            // The instruction in X stage has PC = pc_x.
            // If pc_x is a branch and taken, then the next instruction to fetch is the branch target.
            // The delay slot instruction (at pc_x + 1) is already in F stage.
            // So after the delay slot instruction is fetched, the next fetch should be the branch target.
            
            // So, pc_next logic:
            // If valid_x and instr_x is BRcc and branch is taken, pc_next = branch_target.
            // Else pc_next = pc_f + 1.
            
            // But we need to compute this in the F stage register update.
            // So we need to look at the X stage instruction.
            
            // Let's define pc_next_f as the next value of pc_f.
            reg [4:0] pc_next_f;
            
            if (valid_x && (instr_x[7:6] == 2'b10)) begin
                // Branch instruction in X stage
                // Check if taken
                reg [1:0] cond;
                reg signed [7:0] off;
                reg taken;
                
                cond = instr_x[5:4];
                off = sign_ext_4(instr_x[3:0]);
                
                // Condition check
                case (cond)
                    2'b00: taken = 1'b1;
                    2'b01: taken = (r0_out == 8'sd0);
                    2'b10: taken = (r0_out < 8'sd0);
                    2'b11: taken = (r0_out > 8'sd0);
                    default: taken = 1'b0;
                endcase
                
                if (taken) begin
                    pc_next_f = pc_x + off + 1'b1; // PC + sign_ext(off) + 1
                    // But wait, the delay slot instruction is at pc_x + 1.
                    // The branch target is pc_x + off + 1.
                    // So the next instruction after the delay slot is at pc_x + off + 1.
                    // So pc_next_f = pc_x + off + 1.
                end else begin
                    pc_next_f = pc_f + 1'b1;
                end
            end else begin
                pc_next_f = pc_f + 1'b1;
            end
            
            pc_f <= pc_next_f;
            instr_f <= imem_read(pc_next_f);
            valid_f <= 1'b1;
        end
    end

    // Stage X: Decode/Execute
    always @(posedge clk) begin
        if (rst) begin
            instr_x <= 8'h00;
            pc_x <= 5'd0;
            valid_x <= 1'b1;
        end else if (halt_out) begin
            // Freeze pipeline
            instr_x <= instr_x;
            pc_x <= pc_x;
            valid_x <= valid_x;
        end else begin
            instr_x <= instr_f;
            pc_x <= pc_f;
            valid_x <= valid_f;
        end
    end

    // Stage W: Writeback
    always @(posedge clk) begin
        if (rst) begin
            instr_w <= 8'h00;
            pc_w <= 5'd0;
            valid_w <= 1'b1;
            rd_w <= 2'b00;
            data_w <= 8'sd0;
            we_w <= 1'b0;
            halt_w <= 1'b0;
        end else if (halt_out) begin
            // Freeze pipeline
            instr_w <= instr_w;
            pc_w <= pc_w;
            valid_w <= valid_w;
            rd_w <= rd_w;
            data_w <= data_w;
            we_w <= we_w;
            halt_w <= halt_w;
        end else begin
            instr_w <= instr_x;
            pc_w <= pc_x;
            valid_w <= valid_x;
            
            // Compute writeback data
            if (instr_x[7:6] == 2'b00) begin
                // ADDI
                rd_w <= instr_x[5:4];
                we_w <= 1'b1;
                
                // Read R[rd]
                reg signed [7:0] r_rd;
                r_rd = read_R(instr_x[5:4]);
                
                // Forwarding
                if (fwd_x_w) begin
                    r_rd = fwd_data;
                end
                
                // Compute result
                data_w <= r_rd + sign_ext_4(instr_x[3:0]);
            end else if (instr_x[7:6] == 2'b01) begin
                // MOV
                rd_w <= instr_x[5:4];
                we_w <= 1'b1;
                
                // Read R[rs]
                reg signed [7:0] r_rs;
                r_rs = read_R(instr_x[3:2]);
                
                // Forwarding
                if (fwd_x_w) begin
                    r_rs = fwd_data;
                end
                
                data_w <= r_rs;
            end else if (instr_x[7:6] == 2'b11) begin
                // HALT
                rd_w <= 2'b00;
                we_w <= 1'b0;
                data_w <= 8'sd0;
                halt_w <= 1'b1;
            end else begin
                // BRcc or other
                rd_w <= 2'b00;
                we_w <= 1'b0;
                data_w <= 8'sd0;
                halt_w <= 1'b0;
            end
        end
    end

    // Register File Write
    always @(posedge clk) begin
        if (rst) begin
            R[0] <= 8'sd0;
            R[1] <= 8'sd0;
            R[2] <= 8'sd0;
            R[3] <= 8'sd0;
        end else if (halt_out) begin
            // Freeze registers
        end else if (valid_w && we_w) begin
            R[rd_w] <= data_w;
        end
    end

    // Output assignments
    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 5'd0;
            halt_out <= 1'b0;
            r0_out <= 8'sd0;
        end else if (halt_out) begin
            // Freeze outputs
            pc_out <= pc_out;
            halt_out <= 1'b1;
            r0_out <= r0_out;
        end else begin
            pc_out <= pc_f;
            
            // Update r0_out if R[0] was written
            if (valid_w && we_w && rd_w == 2'b00) begin
                r0_out <= data_w;
            end else if (valid_w && halt_w) begin
                halt_out <= 1'b1;
            end
        end
    end

endmodule

raw model output

```verilog
// Branch convention: The instruction immediately following a BRcc is in the delay slot.
// It is always fetched and executed regardless of whether the branch is taken.
// If the branch is taken, the instruction after the delay slot is the new PC.
// If the branch is not taken, the instruction after the delay slot is the PC+2 of the branch.
// This module implements a 3-stage pipeline (F, X, W) with forwarding from W and X stages
// to the X stage to handle data hazards without stalls.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] R [0:3];

    // Pipeline Registers
    // Stage F (Fetch) -> Stage X (Execute)
    reg [7:0]  instr_x;
    reg [4:0]  pc_x;
    reg        valid_x;

    // Stage X (Execute) -> Stage W (Writeback)
    reg [7:0]  instr_w;
    reg [4:0]  pc_w;
    reg        valid_w;
    reg [1:0]  rd_w;    // destination register index
    reg signed [7:0] data_w; // data to write
    reg        we_w;    // write enable
    reg        halt_w;  // halt signal

    // Stage F internal
    reg [4:0]  pc_f;
    reg [7:0]  instr_f;
    reg        valid_f;

    // Next PC logic for F stage
    reg [4:0]  pc_next;

    // Forwarding signals
    reg        fwd_x_w; // Forward from W stage to X stage
    reg        fwd_x_x; // Forward from X stage to X stage (bypass)
    reg signed [7:0] fwd_data;

    // Helper function to extract byte from IMEM_INIT
    // IMEM_INIT is 256 bits. Byte 0 is bits [255:248], Byte 1 is [247:240], etc.
    // PC is 0-31. Byte index = PC.
    // Value = IMEM_INIT[ (31-PC)*8 +: 8 ]
    function [7:0] imem_read;
        input [4:0] addr;
        begin
            imem_read = IMEM_INIT[ (5'd31 - addr) * 8 +: 8 ];
        end
    endfunction

    // Decode helpers
    function [1:0] get_rd;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b00) get_rd = instr[5:4]; // ADDI
            else if (instr[7:6] == 2'b01) get_rd = instr[5:4]; // MOV
            else get_rd = 2'b00; // BRcc, HALT don't write R
        end
    endfunction

    function [1:0] get_rs;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b01) get_rs = instr[3:2]; // MOV
            else get_rs = 2'b00; // ADDI uses immediate, BRcc uses R0 for condition, HALT does nothing
        end
    endfunction

    function signed [7:0] sign_ext_4;
        input [3:0] imm;
        begin
            sign_ext_4 = {{4{imm[3]}}, imm};
        end
    endfunction

    // Determine if a register is being written in a stage
    function we_x;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b00) we_x = 1; // ADDI
            else if (instr[7:6] == 2'b01) we_x = 1; // MOV
            else we_x = 0;
        end
    endfunction

    function we_w;
        input [7:0] instr;
        begin
            if (instr[7:6] == 2'b00) we_w = 1; // ADDI
            else if (instr[7:6] == 2'b01) we_w = 1; // MOV
            else we_w = 0;
        end
    endfunction

    // Forwarding Logic
    // We need to forward data to the X stage if the instruction in X reads a register
    // that is being written by the instruction in W or X.
    // Priority: W stage (older) over X stage (younger) is not quite right.
    // Standard forwarding: X stage (producing result this cycle) is available for next cycle's X stage?
    // No, in a 3-stage pipeline:
    // Cycle N: Instr A in X, Instr B in F
    // Cycle N+1: Instr A in W, Instr B in X
    // If B reads a register written by A, we can forward A's result from W stage (which is computed in X stage of Cycle N) to B in X stage of Cycle N+1.
    // Also, if Instr C is in F in Cycle N+1, it will be in X in Cycle N+2.
    // Can we forward from W stage (Instr A) to X stage (Instr C)? Yes, if A is in W and C is in X.
    // Can we forward from X stage (Instr B) to X stage (Instr C)? No, B is in X, C is in F. C will be in X next cycle.
    // So, forwarding sources for X stage inputs:
    // 1. W stage result (if W instruction writes to the register needed by X instruction)
    // 2. X stage result (if X instruction writes to the register needed by X instruction? No, X instruction is computing its own result.
    //    Wait, if Instr B is in X, it reads R[rs]. If Instr A is in W, it writes R[rd].
    //    If B's rs == A's rd, forward A's data_w to B.
    //    What if Instr C is in F? It will be in X next cycle.
    //    If Instr B is in X and writes to R[rd], and Instr C (next in F) reads R[rs] == B's rd,
    //    then in next cycle, B will be in W, C in X. We can forward from W (B) to X (C).
    //    So we only need to forward from W stage to X stage.
    //    Is there a case where we need to forward from X stage to X stage?
    //    Only if the same instruction reads and writes the same register? e.g. ADDI R0, R0.
    //    In that case, the value is read from the register file in the X stage.
    //    The new value is computed in X stage and written in W stage.
    //    So for ADDI R0, R0, the read happens in X, write in W. No forwarding needed within X.
    //    So, forwarding is only from W stage to X stage.
    
    // Let's refine:
    // Instr A (in W): writes R[rd_w] with data_w
    // Instr B (in X): reads R[rs_x]
    // If rs_x == rd_w and we_w is 1, then forward data_w to B.
    
    // What about Instr B (in X) writing to R[rd_x] and Instr C (in F) reading R[rs_c]?
    // In next cycle, B is in W, C is in X.
    // So C will see B's result from W stage.
    // So yes, only W->X forwarding is needed for back-to-back instructions.

    always @(*) begin
        // Default: no forwarding
        fwd_x_w = 0;
        fwd_x_x = 0;
        fwd_data = 0;

        // Check W stage forwarding
        if (valid_w && we_w && (rd_w == get_rs(instr_x))) begin
            fwd_x_w = 1;
            fwd_data = data_w;
        end
        // Note: We don't need X->X forwarding because the X stage instruction
        // reads the register file before computing the new value.
        // The new value is not available until the end of the X stage.
        // So it can only be used by the next instruction in the next cycle.
    end

    // Register File Read
    function signed [7:0] read_R;
        input [1:0] idx;
        begin
            read_R = R[idx];
        end
    endfunction

    // Stage F: Fetch
    always @(posedge clk) begin
        if (rst) begin
            pc_f <= 5'd0;
            instr_f <= 8'h00;
            valid_f <= 1'b1;
        end else if (halt_out) begin
            // Freeze pipeline
            pc_f <= pc_f;
            instr_f <= instr_f;
            valid_f <= 1'b1;
        end else begin
            // Determine next PC
            // If branch taken, PC = branch_target
            // Else PC = PC + 1
            // But we need to handle the delay slot.
            // The instruction after the branch is always executed.
            // So if branch is taken, the PC after the delay slot is the target.
            // The delay slot instruction is at PC_branch + 1.
            // So if branch is taken, the next fetch PC is target.
            // If branch is not taken, the next fetch PC is PC_branch + 2.
            // However, the F stage fetches the instruction at pc_f.
            // The X stage determines if the branch is taken.
            // The W stage does nothing for branches.
            
            // Let's use a simple approach:
            // pc_next is computed in F stage based on the instruction in X stage?
            // No, F stage fetches based on current pc_f.
            // The next pc_f should be pc_f + 1 unless a branch in the previous instruction (now in X) was taken.
            // But the branch instruction is in X stage.
            // So, if the instruction in X stage is a branch and it is taken, then the next fetch PC should be the branch target.
            // But wait, the instruction in F stage is the one after the branch (delay slot).
            // So if the branch is taken, the instruction after the delay slot should be fetched next.
            // So pc_next = branch_target.
            // If the branch is not taken, pc_next = pc_f + 1 (which is pc_branch + 2).
            
            // Let's compute pc_next based on the X stage instruction.
            // If X stage instruction is BRcc and taken, pc_next = branch_target.
            // Else pc_next = pc_f + 1.
            
            // But pc_f is the PC of the instruction currently in F stage.
            // The instruction in X stage has PC = pc_x.
            // If pc_x is a branch and taken, then the next instruction to fetch is the branch target.
            // The delay slot instruction (at pc_x + 1) is already in F stage.
            // So after the delay slot instruction is fetched, the next fetch should be the branch target.
            
            // So, pc_next logic:
            // If valid_x and instr_x is BRcc and branch is taken, pc_next = branch_target.
            // Else pc_next = pc_f + 1.
            
            // But we need to compute this in the F stage register update.
            // So we need to look at the X stage instruction.
            
            // Let's define pc_next_f as the next value of pc_f.
            reg [4:0] pc_next_f;
            
            if (valid_x && (instr_x[7:6] == 2'b10)) begin
                // Branch instruction in X stage
                // Check if taken
                reg [1:0] cond;
                reg signed [7:0] off;
                reg taken;
                
                cond = instr_x[5:4];
                off = sign_ext_4(instr_x[3:0]);
                
                // Condition check
                case (cond)
                    2'b00: taken = 1'b1;
                    2'b01: taken = (r0_out == 8'sd0);
                    2'b10: taken = (r0_out < 8'sd0);
                    2'b11: taken = (r0_out > 8'sd0);
                    default: taken = 1'b0;
                endcase
                
                if (taken) begin
                    pc_next_f = pc_x + off + 1'b1; // PC + sign_ext(off) + 1
                    // But wait, the delay slot instruction is at pc_x + 1.
                    // The branch target is pc_x + off + 1.
                    // So the next instruction after the delay slot is at pc_x + off + 1.
                    // So pc_next_f = pc_x + off + 1.
                end else begin
                    pc_next_f = pc_f + 1'b1;
                end
            end else begin
                pc_next_f = pc_f + 1'b1;
            end
            
            pc_f <= pc_next_f;
            instr_f <= imem_read(pc_next_f);
            valid_f <= 1'b1;
        end
    end

    // Stage X: Decode/Execute
    always @(posedge clk) begin
        if (rst) begin
            instr_x <= 8'h00;
            pc_x <= 5'd0;
            valid_x <= 1'b1;
        end else if (halt_out) begin
            // Freeze pipeline
            instr_x <= instr_x;
            pc_x <= pc_x;
            valid_x <= valid_x;
        end else begin
            instr_x <= instr_f;
            pc_x <= pc_f;
            valid_x <= valid_f;
        end
    end

    // Stage W: Writeback
    always @(posedge clk) begin
        if (rst) begin
            instr_w <= 8'h00;
            pc_w <= 5'd0;
            valid_w <= 1'b1;
            rd_w <= 2'b00;
            data_w <= 8'sd0;
            we_w <= 1'b0;
            halt_w <= 1'b0;
        end else if (halt_out) begin
            // Freeze pipeline
            instr_w <= instr_w;
            pc_w <= pc_w;
            valid_w <= valid_w;
            rd_w <= rd_w;
            data_w <= data_w;
            we_w <= we_w;
            halt_w <= halt_w;
        end else begin
            instr_w <= instr_x;
            pc_w <= pc_x;
            valid_w <= valid_x;
            
            // Compute writeback data
            if (instr_x[7:6] == 2'b00) begin
                // ADDI
                rd_w <= instr_x[5:4];
                we_w <= 1'b1;
                
                // Read R[rd]
                reg signed [7:0] r_rd;
                r_rd = read_R(instr_x[5:4]);
                
                // Forwarding
                if (fwd_x_w) begin
                    r_rd = fwd_data;
                end
                
                // Compute result
                data_w <= r_rd + sign_ext_4(instr_x[3:0]);
            end else if (instr_x[7:6] == 2'b01) begin
                // MOV
                rd_w <= instr_x[5:4];
                we_w <= 1'b1;
                
                // Read R[rs]
                reg signed [7:0] r_rs;
                r_rs = read_R(instr_x[3:2]);
                
                // Forwarding
                if (fwd_x_w) begin
                    r_rs = fwd_data;
                end
                
                data_w <= r_rs;
            end else if (instr_x[7:6] == 2'b11) begin
                // HALT
                rd_w <= 2'b00;
                we_w <= 1'b0;
                data_w <= 8'sd0;
                halt_w <= 1'b1;
            end else begin
                // BRcc or other
                rd_w <= 2'b00;
                we_w <= 1'b0;
                data_w <= 8'sd0;
                halt_w <= 1'b0;
            end
        end
    end

    // Register File Write
    always @(posedge clk) begin
        if (rst) begin
            R[0] <= 8'sd0;
            R[1] <= 8'sd0;
            R[2] <= 8'sd0;
            R[3] <= 8'sd0;
        end else if (halt_out) begin
            // Freeze registers
        end else if (valid_w && we_w) begin
            R[rd_w] <= data_w;
        end
    end

    // Output assignments
    always @(posedge clk) begin
        if (rst) begin
            pc_out <= 5'd0;
            halt_out <= 1'b0;
            r0_out <= 8'sd0;
        end else if (halt_out) begin
            // Freeze outputs
            pc_out <= pc_out;
            halt_out <= 1'b1;
            r0_out <= r0_out;
        end else begin
            pc_out <= pc_f;
            
            // Update r0_out if R[0] was written
            if (valid_w && we_w && rd_w == 2'b00) begin
                r0_out <= data_w;
            end else if (valid_w && halt_w) begin
                halt_out <= 1'b1;
            end
        end
    end

endmodule
```

line-item PPA

per-program functional

pipeline shape (heuristic register bank count)

mistakes detected

extracted RTL

raw model output