sft_3

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Registers
    // Stage F -> X
    reg [7:0] f_x_instr;
    reg [4:0] f_x_pc;
    reg       f_x_valid;

    // Stage X -> W
    reg [1:0] x_w_op;      // 00: NOP, 01: ADDI, 10: MOV, 11: HALT
    reg [1:0] x_w_rd;
    reg signed [7:0] x_w_result;
    reg [1:0] x_w_cond;
    reg signed [7:0] x_w_r0_val; // R0 value after execution for branch condition
    reg x_w_valid;
    reg x_w_is_branch;
    reg x_w_branch_taken;
    reg [4:0] x_w_branch_target;

    // Stage W -> Commit (Implicit in register file update and output update)
    // We use a simple register file update at the end of W stage.

    // Forwarding Logic
    // We need to forward results from X stage to X stage inputs if the source
    // register matches the destination register of the instruction in X stage.
    // Since we have a 3-stage pipeline, the X stage instruction is the one
    // that was in F stage 1 cycle ago. The W stage instruction is the one
    // that was in X stage 1 cycle ago.
    // Actually, let's look at the timing:
    // Cycle N: Instr A in F, Instr B in X, Instr C in W
    // Cycle N+1: Instr B in X, Instr C in W, Instr D in F
    // Instr B in X might need a register written by Instr A.
    // Instr A is in W stage in Cycle N. Its result is available in x_w_result.
    // So we can forward from W stage to X stage.
    // Also, Instr C in W might write a register that Instr B in X needs? No, B is older.
    // What about Instr D in F? It doesn't execute until Cycle N+2.
    // So the only forwarding needed is from W stage result to X stage operands.

    reg signed [7:0] fwd_val;
    reg fwd_valid;

    // Decode instruction fields
    // F Stage
    wire [7:0] f_instr = IMEM_INIT[f_x_pc*8 +: 8]; // This is not correct for current cycle fetch
    // Let's define the current PC for fetch
    reg [4:0] pc_curr;
    reg [4:0] pc_next;
    
    // The instruction fetched in the current cycle is based on pc_curr.
    // But we need to store it in f_x_instr for the next cycle.
    wire [7:0] instr_fetch = IMEM_INIT[pc_curr*8 +: 8];

    // X Stage Decode
    wire [1:0] x_op = f_x_instr[7:6];
    wire [1:0] x_rd = f_x_instr[5:4];
    wire [3:0] x_imm = f_x_instr[3:0];
    wire [1:0] x_rs = f_x_instr[3:2]; // For MOV, bits 3:2 are rs. For ADDI, bits 3:0 are imm.
    // Wait, the encoding:
    // ADDI: 00 rd[2] imm[4] -> bits 7:6=00, 5:4=rd, 3:0=imm
    // MOV:  01 rd[2] rs[2] _[2] -> bits 7:6=01, 5:4=rd, 3:2=rs, 1:0=xx
    // BRcc: 10 cond[2] off[4] -> bits 7:6=10, 5:4=cond, 3:0=off
    // HALT: 11 _ _ _ _ _ _ _ _ -> bits 7:6=11

    wire signed [7:0] x_imm_sext = {{4{x_imm[3]}}, x_imm};
    wire signed [7:0] x_off_sext = {{4{f_x_instr[3]}}, f_x_instr[3:0]};

    // Read Register File for X stage
    // We need to read R[x_rs] for MOV.
    // We need to read R[x_rd] for ADDI (as source).
    // Forwarding: If x_rd (of W stage) matches the source register of X stage, use W result.
    
    reg signed [7:0] x_rs_val;
    reg signed [7:0] x_rd_val; // For ADDI, rd is also source

    // Forwarding from W stage to X stage
    // W stage has x_w_rd and x_w_result.
    // X stage needs R[x_rs] for MOV and R[x_rd] for ADDI.
    
    // Let's define the forwarding logic in the always block.

    // Branch Condition Evaluation in X stage
    wire x_cond_always = (f_x_instr[5:4] == 2'b00);
    wire x_cond_zero = (f_x_instr[5:4] == 2'b01) && (x_w_r0_val == 0); // Wait, branch condition depends on R0.
    // The branch instruction itself doesn't modify R0. It reads R0.
    // So we need the current value of R0.
    // But R0 might be updated by the instruction in W stage?
    // No, the instruction in W stage is older than the branch in X stage?
    // Let's trace:
    // Cycle N: Instr A (W), Instr B (X), Instr C (F)
    // Instr B is a branch. It needs R0.
    // Instr A is in W stage. It might write R0.
    // If Instr A writes R0, and Instr B reads R0, we should forward A's result to B.
    // So x_w_r0_val should be the value of R0 after Instr A executes.
    // But Instr A is in W stage, so its result is x_w_result.
    // If Instr A writes R0, then x_w_rd == 0, so R0 becomes x_w_result.
    // Otherwise, R0 is rf[0].
    // So the value of R0 for the branch condition is:
    // if (x_w_valid && x_w_rd == 0) ? x_w_result : rf[0];
    // But wait, the branch instruction is in X stage. The W stage instruction is the one
    // that was in X stage last cycle.
    // So yes, we can forward from W to X for the R0 read.

    reg signed [7:0] r0_for_branch;
    wire x_cond_zero_met = (f_x_instr[5:4] == 2'b01) && (r0_for_branch == 0);
    wire x_cond_neg_met = (f_x_instr[5:4] == 2'b10) && (r0_for_branch < 0);
    wire x_cond_pos_met = (f_x_instr[5:4] == 2'b11) && (r0_for_branch > 0);
    wire x_branch_taken = x_cond_always | x_cond_zero_met | x_cond_neg_met | x_cond_pos_met;

    // ALU Result for X stage
    reg signed [7:0] x_alu_result;
    reg [1:0] x_next_op;
    reg [1:0] x_next_rd;
    reg x_next_is_branch;
    reg x_next_branch_taken;
    reg [4:0] x_next_branch_target;
    reg x_next_valid;

    // W Stage Writeback
    // If x_w_valid, write x_w_result to rf[x_w_rd] if op is ADDI or MOV.
    // If op is HALT, set halt.

    // PC Update
    // If branch taken, PC = branch_target.
    // Else, PC = f_x_pc + 1.
    // But wait, the branch target is computed in X stage.
    // The PC for the next fetch is determined by the branch decision.
    // The instruction after the branch (delay slot) is already fetched.
    // So if branch is taken, the next PC is the target.
    // If branch is not taken, the next PC is f_x_pc + 1 + 1? No.
    // Let's trace carefully.
    // Cycle N:
    // F: Fetch Instr C at PC=2
    // X: Decode Instr B at PC=1 (Branch)
    // W: Commit Instr A at PC=0
    // If Instr B is taken, the next instruction to execute after the delay slot (C) is at Target.
    // So in Cycle N+1:
    // F: Fetch Instr D at Target
    // X: Decode Instr C at PC=2
    // W: Commit Instr B (no write)
    // So the PC for fetch in Cycle N+1 should be Target if taken, else PC=3.
    // The branch target is PC_branch + sign_ext(off) + 1.
    // PC_branch is f_x_pc in Cycle N.
    // So target = f_x_pc + x_off_sext + 1.
    
    // Reset
    always @(posedge clk) begin
        if (rst) begin
            pc_curr <= 0;
            f_x_instr <= 0;
            f_x_pc <= 0;
            f_x_valid <= 0;
            x_w_op <= 0;
            x_w_rd <= 0;
            x_w_result <= 0;
            x_w_cond <= 0;
            x_w_r0_val <= 0;
            x_w_valid <= 0;
            x_w_is_branch <= 0;
            x_w_branch_taken <= 0;
            x_w_branch_target <= 0;
            halt_out <= 0;
            pc_out <= 0;
            r0_out <= 0;
            for (int i = 0; i < 4; i++) rf[i] <= 0;
        end else begin
            // Update Register File
            if (x_w_valid) begin
                if (x_w_op == 2'b01 || x_w_op == 2'b10) begin // ADDI or MOV
                    rf[x_w_rd] <= x_w_result;
                end else if (x_w_op == 2'b11) begin // HALT
                    halt_out <= 1;
                end
            end

            // Update Outputs
            pc_out <= pc_curr;
            r0_out <= rf[0];

            // Advance Pipeline
            // F -> X
            f_x_instr <= instr_fetch;
            f_x_pc <= pc_curr;
            f_x_valid <= ~halt_out; // If halt, freeze pipeline

            // X -> W
            x_w_op <= x_next_op;
            x_w_rd <= x_next_rd;
            x_w_result <= x_alu_result;
            x_w_cond <= f_x_instr[5:4];
            x_w_r0_val <= r0_for_branch; // This is the R0 value used for branch condition
            x_w_valid <= x_next_valid;
            x_w_is_branch <= x_next_is_branch;
            x_w_branch_taken <= x_next_branch_taken;
            x_w_branch_target <= x_next_branch_target;

            // Update PC
            if (halt_out) begin
                pc_curr <= pc_curr; // Freeze
            end else if (x_w_is_branch && x_w_branch_taken) begin
                pc_curr <= x_w_branch_target;
            end else begin
                pc_curr <= pc_curr + 1;
            end
        end
    end

    // Combinational Logic for X Stage
    always @(*) begin
        // Default
        x_next_op = 2'b00;
        x_next_rd = 2'b00;
        x_alu_result = 0;
        x_next_valid = 0;
        x_next_is_branch = 0;
        x_next_branch_taken = 0;
        x_next_branch_target = 0;
        r0_for_branch = rf[0];

        // Forwarding for R0 in Branch Condition
        if (f_x_valid) begin
            // Check if W stage writes R0
            if (x_w_valid && x_w_rd == 0 && (x_w_op == 2'b01 || x_w_op == 2'b10)) begin
                r0_for_branch = x_w_result;
            end else begin
                r0_for_branch = rf[0];
            end
        end else begin
            r0_for_branch = rf[0];
        end

        if (!f_x_valid) begin
            x_next_valid = 0;
        end else begin
            x_next_valid = 1;
            case (f_x_instr[7:6])
                2'b00: begin // ADDI
                    x_next_op = 2'b01;
                    x_next_rd = f_x_instr[5:4];
                    // Read R[rd]
                    signed [7:0] src_val;
                    // Forwarding for source register
                    if (x_w_valid && x_w_rd == f_x_instr[5:4] && (x_w_op == 2'b01 || x_w_op == 2'b10)) begin
                        src_val = x_w_result;
                    end else begin
                        src_val = rf[f_x_instr[5:4]];
                    end
                    x_alu_result = src_val + x_imm_sext;
                    x_next_is_branch = 0;
                end
                2'b01: begin // MOV
                    x_next_op = 2'b10;
                    x_next_rd = f_x_instr[5:4];
                    // Read R[rs]
                    signed [7:0] src_val;
                    if (x_w_valid && x_w_rd == f_x_instr[3:2] && (x_w_op == 2'b01 || x_w_op == 2'b10)) begin
                        src_val = x_w_result;
                    end else begin
                        src_val = rf[f_x_instr[3:2]];
                    end
                    x_alu_result = src_val;
                    x_next_is_branch = 0;
                end
                2'b10: begin // BRcc
                    x_next_op = 2'b00; // No write
                    x_next_rd = 0;
                    x_next_is_branch = 1;
                    x_next_branch_taken = x_branch_taken;
                    x_next_branch_target = f_x_pc + x_off_sext + 1;
                    x_alu_result = 0;
                end
                2'b11: begin // HALT
                    x_next_op = 2'b11;
                    x_next_rd = 0;
                    x_next_is_branch = 0;
                    x_alu_result = 0;
                end
                default: begin
                    x_next_op = 2'b00;
                    x_next_is_branch = 0;
                end
            endcase
        end
    end

endmodule

raw model output

```verilog
// Branch convention: The instruction immediately following the branch instruction
// (at PC_branch + 1) is always executed (1 branch delay slot). The branch target
// instruction is fetched at PC_target. If the branch is not taken, the PC continues
// to PC_branch + 2. This matches the semantics: "if (cond_met) PC <= PC + sign_ext(off,4)+1".
// In a pipelined implementation, this means the fetch stage computes the next PC
// based on the branch decision resolved in the Execute stage. The instruction fetched
// during the cycle the branch is decoded is the delay slot instruction.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

    // Register File: 4 registers, 8-bit signed
    reg signed [7:0] rf [0:3];

    // Pipeline Registers
    // Stage F -> X
    reg [7:0] f_x_instr;
    reg [4:0] f_x_pc;
    reg       f_x_valid;

    // Stage X -> W
    reg [1:0] x_w_op;      // 00: NOP, 01: ADDI, 10: MOV, 11: HALT
    reg [1:0] x_w_rd;
    reg signed [7:0] x_w_result;
    reg [1:0] x_w_cond;
    reg signed [7:0] x_w_r0_val; // R0 value after execution for branch condition
    reg x_w_valid;
    reg x_w_is_branch;
    reg x_w_branch_taken;
    reg [4:0] x_w_branch_target;

    // Stage W -> Commit (Implicit in register file update and output update)
    // We use a simple register file update at the end of W stage.

    // Forwarding Logic
    // We need to forward results from X stage to X stage inputs if the source
    // register matches the destination register of the instruction in X stage.
    // Since we have a 3-stage pipeline, the X stage instruction is the one
    // that was in F stage 1 cycle ago. The W stage instruction is the one
    // that was in X stage 1 cycle ago.
    // Actually, let's look at the timing:
    // Cycle N: Instr A in F, Instr B in X, Instr C in W
    // Cycle N+1: Instr B in X, Instr C in W, Instr D in F
    // Instr B in X might need a register written by Instr A.
    // Instr A is in W stage in Cycle N. Its result is available in x_w_result.
    // So we can forward from W stage to X stage.
    // Also, Instr C in W might write a register that Instr B in X needs? No, B is older.
    // What about Instr D in F? It doesn't execute until Cycle N+2.
    // So the only forwarding needed is from W stage result to X stage operands.

    reg signed [7:0] fwd_val;
    reg fwd_valid;

    // Decode instruction fields
    // F Stage
    wire [7:0] f_instr = IMEM_INIT[f_x_pc*8 +: 8]; // This is not correct for current cycle fetch
    // Let's define the current PC for fetch
    reg [4:0] pc_curr;
    reg [4:0] pc_next;
    
    // The instruction fetched in the current cycle is based on pc_curr.
    // But we need to store it in f_x_instr for the next cycle.
    wire [7:0] instr_fetch = IMEM_INIT[pc_curr*8 +: 8];

    // X Stage Decode
    wire [1:0] x_op = f_x_instr[7:6];
    wire [1:0] x_rd = f_x_instr[5:4];
    wire [3:0] x_imm = f_x_instr[3:0];
    wire [1:0] x_rs = f_x_instr[3:2]; // For MOV, bits 3:2 are rs. For ADDI, bits 3:0 are imm.
    // Wait, the encoding:
    // ADDI: 00 rd[2] imm[4] -> bits 7:6=00, 5:4=rd, 3:0=imm
    // MOV:  01 rd[2] rs[2] _[2] -> bits 7:6=01, 5:4=rd, 3:2=rs, 1:0=xx
    // BRcc: 10 cond[2] off[4] -> bits 7:6=10, 5:4=cond, 3:0=off
    // HALT: 11 _ _ _ _ _ _ _ _ -> bits 7:6=11

    wire signed [7:0] x_imm_sext = {{4{x_imm[3]}}, x_imm};
    wire signed [7:0] x_off_sext = {{4{f_x_instr[3]}}, f_x_instr[3:0]};

    // Read Register File for X stage
    // We need to read R[x_rs] for MOV.
    // We need to read R[x_rd] for ADDI (as source).
    // Forwarding: If x_rd (of W stage) matches the source register of X stage, use W result.
    
    reg signed [7:0] x_rs_val;
    reg signed [7:0] x_rd_val; // For ADDI, rd is also source

    // Forwarding from W stage to X stage
    // W stage has x_w_rd and x_w_result.
    // X stage needs R[x_rs] for MOV and R[x_rd] for ADDI.
    
    // Let's define the forwarding logic in the always block.

    // Branch Condition Evaluation in X stage
    wire x_cond_always = (f_x_instr[5:4] == 2'b00);
    wire x_cond_zero = (f_x_instr[5:4] == 2'b01) && (x_w_r0_val == 0); // Wait, branch condition depends on R0.
    // The branch instruction itself doesn't modify R0. It reads R0.
    // So we need the current value of R0.
    // But R0 might be updated by the instruction in W stage?
    // No, the instruction in W stage is older than the branch in X stage?
    // Let's trace:
    // Cycle N: Instr A (W), Instr B (X), Instr C (F)
    // Instr B is a branch. It needs R0.
    // Instr A is in W stage. It might write R0.
    // If Instr A writes R0, and Instr B reads R0, we should forward A's result to B.
    // So x_w_r0_val should be the value of R0 after Instr A executes.
    // But Instr A is in W stage, so its result is x_w_result.
    // If Instr A writes R0, then x_w_rd == 0, so R0 becomes x_w_result.
    // Otherwise, R0 is rf[0].
    // So the value of R0 for the branch condition is:
    // if (x_w_valid && x_w_rd == 0) ? x_w_result : rf[0];
    // But wait, the branch instruction is in X stage. The W stage instruction is the one
    // that was in X stage last cycle.
    // So yes, we can forward from W to X for the R0 read.

    reg signed [7:0] r0_for_branch;
    wire x_cond_zero_met = (f_x_instr[5:4] == 2'b01) && (r0_for_branch == 0);
    wire x_cond_neg_met = (f_x_instr[5:4] == 2'b10) && (r0_for_branch < 0);
    wire x_cond_pos_met = (f_x_instr[5:4] == 2'b11) && (r0_for_branch > 0);
    wire x_branch_taken = x_cond_always | x_cond_zero_met | x_cond_neg_met | x_cond_pos_met;

    // ALU Result for X stage
    reg signed [7:0] x_alu_result;
    reg [1:0] x_next_op;
    reg [1:0] x_next_rd;
    reg x_next_is_branch;
    reg x_next_branch_taken;
    reg [4:0] x_next_branch_target;
    reg x_next_valid;

    // W Stage Writeback
    // If x_w_valid, write x_w_result to rf[x_w_rd] if op is ADDI or MOV.
    // If op is HALT, set halt.

    // PC Update
    // If branch taken, PC = branch_target.
    // Else, PC = f_x_pc + 1.
    // But wait, the branch target is computed in X stage.
    // The PC for the next fetch is determined by the branch decision.
    // The instruction after the branch (delay slot) is already fetched.
    // So if branch is taken, the next PC is the target.
    // If branch is not taken, the next PC is f_x_pc + 1 + 1? No.
    // Let's trace carefully.
    // Cycle N:
    // F: Fetch Instr C at PC=2
    // X: Decode Instr B at PC=1 (Branch)
    // W: Commit Instr A at PC=0
    // If Instr B is taken, the next instruction to execute after the delay slot (C) is at Target.
    // So in Cycle N+1:
    // F: Fetch Instr D at Target
    // X: Decode Instr C at PC=2
    // W: Commit Instr B (no write)
    // So the PC for fetch in Cycle N+1 should be Target if taken, else PC=3.
    // The branch target is PC_branch + sign_ext(off) + 1.
    // PC_branch is f_x_pc in Cycle N.
    // So target = f_x_pc + x_off_sext + 1.
    
    // Reset
    always @(posedge clk) begin
        if (rst) begin
            pc_curr <= 0;
            f_x_instr <= 0;
            f_x_pc <= 0;
            f_x_valid <= 0;
            x_w_op <= 0;
            x_w_rd <= 0;
            x_w_result <= 0;
            x_w_cond <= 0;
            x_w_r0_val <= 0;
            x_w_valid <= 0;
            x_w_is_branch <= 0;
            x_w_branch_taken <= 0;
            x_w_branch_target <= 0;
            halt_out <= 0;
            pc_out <= 0;
            r0_out <= 0;
            for (int i = 0; i < 4; i++) rf[i] <= 0;
        end else begin
            // Update Register File
            if (x_w_valid) begin
                if (x_w_op == 2'b01 || x_w_op == 2'b10) begin // ADDI or MOV
                    rf[x_w_rd] <= x_w_result;
                end else if (x_w_op == 2'b11) begin // HALT
                    halt_out <= 1;
                end
            end

            // Update Outputs
            pc_out <= pc_curr;
            r0_out <= rf[0];

            // Advance Pipeline
            // F -> X
            f_x_instr <= instr_fetch;
            f_x_pc <= pc_curr;
            f_x_valid <= ~halt_out; // If halt, freeze pipeline

            // X -> W
            x_w_op <= x_next_op;
            x_w_rd <= x_next_rd;
            x_w_result <= x_alu_result;
            x_w_cond <= f_x_instr[5:4];
            x_w_r0_val <= r0_for_branch; // This is the R0 value used for branch condition
            x_w_valid <= x_next_valid;
            x_w_is_branch <= x_next_is_branch;
            x_w_branch_taken <= x_next_branch_taken;
            x_w_branch_target <= x_next_branch_target;

            // Update PC
            if (halt_out) begin
                pc_curr <= pc_curr; // Freeze
            end else if (x_w_is_branch && x_w_branch_taken) begin
                pc_curr <= x_w_branch_target;
            end else begin
                pc_curr <= pc_curr + 1;
            end
        end
    end

    // Combinational Logic for X Stage
    always @(*) begin
        // Default
        x_next_op = 2'b00;
        x_next_rd = 2'b00;
        x_alu_result = 0;
        x_next_valid = 0;
        x_next_is_branch = 0;
        x_next_branch_taken = 0;
        x_next_branch_target = 0;
        r0_for_branch = rf[0];

        // Forwarding for R0 in Branch Condition
        if (f_x_valid) begin
            // Check if W stage writes R0
            if (x_w_valid && x_w_rd == 0 && (x_w_op == 2'b01 || x_w_op == 2'b10)) begin
                r0_for_branch = x_w_result;
            end else begin
                r0_for_branch = rf[0];
            end
        end else begin
            r0_for_branch = rf[0];
        end

        if (!f_x_valid) begin
            x_next_valid = 0;
        end else begin
            x_next_valid = 1;
            case (f_x_instr[7:6])
                2'b00: begin // ADDI
                    x_next_op = 2'b01;
                    x_next_rd = f_x_instr[5:4];
                    // Read R[rd]
                    signed [7:0] src_val;
                    // Forwarding for source register
                    if (x_w_valid && x_w_rd == f_x_instr[5:4] && (x_w_op == 2'b01 || x_w_op == 2'b10)) begin
                        src_val = x_w_result;
                    end else begin
                        src_val = rf[f_x_instr[5:4]];
                    end
                    x_alu_result = src_val + x_imm_sext;
                    x_next_is_branch = 0;
                end
                2'b01: begin // MOV
                    x_next_op = 2'b10;
                    x_next_rd = f_x_instr[5:4];
                    // Read R[rs]
                    signed [7:0] src_val;
                    if (x_w_valid && x_w_rd == f_x_instr[3:2] && (x_w_op == 2'b01 || x_w_op == 2'b10)) begin
                        src_val = x_w_result;
                    end else begin
                        src_val = rf[f_x_instr[3:2]];
                    end
                    x_alu_result = src_val;
                    x_next_is_branch = 0;
                end
                2'b10: begin // BRcc
                    x_next_op = 2'b00; // No write
                    x_next_rd = 0;
                    x_next_is_branch = 1;
                    x_next_branch_taken = x_branch_taken;
                    x_next_branch_target = f_x_pc + x_off_sext + 1;
                    x_alu_result = 0;
                end
                2'b11: begin // HALT
                    x_next_op = 2'b11;
                    x_next_rd = 0;
                    x_next_is_branch = 0;
                    x_alu_result = 0;
                end
                default: begin
                    x_next_op = 2'b00;
                    x_next_is_branch = 0;
                end
            endcase
        end
    end

endmodule
```

line-item PPA

per-program functional

pipeline shape (heuristic register bank count)

mistakes detected

extracted RTL

raw model output