sft_1

extracted RTL

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

// Register file: 4 registers, 8-bit signed
reg signed [7:0] rf [0:3];

// Pipeline registers
// Stage F (Fetch)
reg [4:0] pc_f;
reg [7:0] instr_f;

// Stage X (Execute)
reg [4:0] pc_x;
reg [7:0] instr_x;
reg signed [7:0] rs_val_x;  // Read register value (after forwarding)
reg [1:0] cond_x;           // Condition for branch
reg [3:0] off_x;            // Offset for branch
reg [1:0] rd_x;             // Destination register
reg [1:0] rs_x;             // Source register (for MOV)
reg [3:0] imm_x;            // Immediate (for ADDI)
reg [1:0] opcode_x;         // Opcode
reg branch_taken_x;         // Branch condition met?
reg is_halt_x;              // Is HALT?
reg signed [7:0] alu_res_x; // ALU result

// Stage W (Writeback)
reg [4:0] pc_w;
reg [7:0] instr_w;
reg [1:0] rd_w;             // Destination register
reg signed [7:0] result_w;  // Result to write
reg is_halt_w;              // Is HALT?
reg valid_w;                // Valid instruction in W stage

// Forwarding logic
// We need to forward from X stage and W stage to the X stage of the current instruction.
// The X stage instruction reads registers. If the source register matches a destination
// register being written by X or W stage, we forward.

// Forwarding from X stage to X stage (current instruction's X stage reads from previous instruction's X stage result)
// Forwarding from W stage to X stage (current instruction's X stage reads from instruction before previous's W stage result)

reg signed [7:0] fwd_val_x;
reg fwd_en_x;

// Determine forwarding for the instruction in X stage
always @(*) begin
    // Default: no forwarding, use register file value
    fwd_en_x = 0;
    fwd_val_x = rf[rs_x];

    // Check if we should forward from W stage (instruction in W stage writes to rd_w)
    // The instruction in W stage is the one that was in X stage 2 cycles ago.
    // It writes to rd_w. If rs_x == rd_w and rd_w != 0 and valid_w, forward from W.
    if (valid_w && (rd_w != 2'b00) && (rs_x == rd_w)) begin
        fwd_en_x = 1;
        fwd_val_x = result_w;
    end
    // Check if we should forward from X stage (instruction in X stage writes to rd_x)
    // The instruction in X stage is the one that was in F stage 1 cycle ago.
    // It computes alu_res_x. If rs_x == rd_x and rd_x != 0, forward from X.
    // Note: X stage forwarding takes priority over W stage forwarding if both match,
    // because X stage result is more recent.
    else if ((rd_x != 2'b00) && (rs_x == rd_x)) begin
        fwd_en_x = 1;
        fwd_val_x = alu_res_x;
    end
end

// Helper function to sign-extend 4-bit immediate to 8-bit
function signed [7:0] sign_ext4;
    input [3:0] val;
    begin
        sign_ext4 = {{4{val[3]}}, val};
    end
endfunction

// Main sequential logic
always @(posedge clk) begin
    if (rst) begin
        // Reset everything
        pc_f <= 5'b0;
        instr_f <= 8'b0;
        pc_x <= 5'b0;
        instr_x <= 8'b0;
        rs_val_x <= 8'b0;
        cond_x <= 2'b0;
        off_x <= 4'b0;
        rd_x <= 2'b0;
        rs_x <= 2'b0;
        imm_x <= 4'b0;
        opcode_x <= 2'b0;
        branch_taken_x <= 1'b0;
        is_halt_x <= 1'b0;
        alu_res_x <= 8'b0;
        pc_w <= 5'b0;
        instr_w <= 8'b0;
        rd_w <= 2'b0;
        result_w <= 8'b0;
        is_halt_w <= 1'b0;
        valid_w <= 1'b0;
        pc_out <= 5'b0;
        halt_out <= 1'b0;
        r0_out <= 8'b0;
        rf[0] <= 8'b0;
        rf[1] <= 8'b0;
        rf[2] <= 8'b0;
        rf[3] <= 8'b0;
    end else if (halt_out) begin
        // Pipeline frozen: keep current state
        // pc_out remains at the HALT instruction's PC
        // r0_out remains unchanged
        // All pipeline registers hold their last values
    end else begin
        // --- W Stage (Writeback) ---
        // Write result to register file if not HALT and rd != 0
        if (valid_w && !is_halt_w && (rd_w != 2'b00)) begin
            rf[rd_w] <= result_w;
        end

        // Update r0_out if R0 was written
        if (valid_w && !is_halt_w && (rd_w == 2'b00)) begin
            r0_out <= result_w;
        end else if (valid_w && is_halt_w) begin
            // HALT: r0_out unchanged
        end

        // Move X to W
        pc_w <= pc_x;
        instr_w <= instr_x;
        rd_w <= rd_x;
        result_w <= alu_res_x;
        is_halt_w <= is_halt_x;
        valid_w <= 1'b1;

        // Update halt_out if HALT reached W stage
        if (valid_w && is_halt_w) begin
            halt_out <= 1'b1;
        end

        // --- X Stage (Decode/Execute) ---
        // Decode instruction in X stage
        opcode_x <= instr_x[7:6];
        case (opcode_x)
            2'b00: begin // ADDI
                rd_x <= instr_x[5:4];
                imm_x <= instr_x[3:0];
                rs_x <= 2'b00; // Not used
                cond_x <= 2'b00; // Not used
                off_x <= 4'b0; // Not used
                is_halt_x <= 1'b0;
                branch_taken_x <= 1'b0;
            end
            2'b01: begin // MOV
                rd_x <= instr_x[5:4];
                rs_x <= instr_x[3:2];
                imm_x <= 4'b0; // Not used
                cond_x <= 2'b00; // Not used
                off_x <= 4'b0; // Not used
                is_halt_x <= 1'b0;
                branch_taken_x <= 1'b0;
            end
            2'b10: begin // BRcc
                cond_x <= instr_x[5:4];
                off_x <= instr_x[3:0];
                rd_x <= 2'b00; // Not used
                rs_x <= 2'b00; // Not used
                imm_x <= 4'b0; // Not used
                is_halt_x <= 1'b0;
            end
            2'b11: begin // HALT
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0;
                cond_x <= 2'b00;
                off_x <= 4'b0;
                is_halt_x <= 1'b1;
                branch_taken_x <= 1'b0;
            end
            default: begin
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0;
                cond_x <= 2'b00;
                off_x <= 4'b0;
                is_halt_x <= 1'b0;
                branch_taken_x <= 1'b0;
            end
        endcase

        // Execute ALU/Branch
        case (opcode_x)
            2'b00: begin // ADDI
                alu_res_x <= rf[rd_x] + sign_ext4(imm_x);
                branch_taken_x <= 1'b0;
            end
            2'b01: begin // MOV
                alu_res_x <= rf[rs_x];
                branch_taken_x <= 1'b0;
            end
            2'b10: begin // BRcc
                // Evaluate branch condition based on R0
                case (cond_x)
                    2'b00: branch_taken_x = 1'b1; // Always
                    2'b01: branch_taken_x = (rf[0] == 8'b0); // Zero
                    2'b10: branch_taken_x = (rf[0] < 8'b0);  // Negative
                    2'b11: branch_taken_x = (rf[0] > 8'b0);  // Positive
                    default: branch_taken_x = 1'b0;
                endcase
                alu_res_x <= 8'b0; // Not used
            end
            2'b11: begin // HALT
                alu_res_x <= 8'b0;
                branch_taken_x <= 1'b0;
            end
            default: begin
                alu_res_x <= 8'b0;
                branch_taken_x <= 1'b0;
            end
        endcase

        // --- F Stage (Fetch) ---
        // Compute next PC
        // If the instruction in X stage was a taken branch, the next PC after the delay slot
        // should be the branch target. But the delay slot instruction is the one currently in W stage.
        // So: if branch was taken in X stage, the instruction in W stage is the delay slot.
        // After W stage completes, the next fetch should be the branch target.
        // However, we need to handle the PC update carefully.
        
        // Current F stage PC is pc_f. The next instruction to fetch depends on whether
        // the branch in the X stage (which is the instruction at pc_x) was taken.
        // But wait: the X stage instruction is the one fetched in the previous cycle.
        // The W stage instruction is the one fetched 2 cycles ago.
        
        // Let's think about the pipeline timing:
        // Cycle N: F fetches PC=N, X executes PC=N-1, W writes PC=N-2
        // If X stage (PC=N-1) is a taken branch, the delay slot is PC=N (currently in F).
        // After cycle N, the delay slot (PC=N) moves to X, and the next fetch should be
        // the branch target if the branch was taken.
        
        // So: if the instruction in X stage is a taken branch, the next PC (for the cycle after the delay slot)
        // is the branch target. The delay slot instruction (currently in F) will execute normally.
        
        // Compute branch target: pc_x + sign_ext(off_x, 4) + 1
        signed [7:0] branch_target = pc_x + sign_ext4(off_x) + 1;
        
        // Determine next PC for F stage
        // If the instruction in W stage was a taken branch, then the current F stage instruction
        // is the delay slot, and the next F stage should fetch the branch target.
        // Wait, let's reconsider. The branch instruction is in X stage. The delay slot is in F stage.
        // After this cycle, the delay slot moves to X, and we need to fetch the branch target (if taken)
        // or pc_f + 1 (if not taken) for the next F stage.
        
        // Actually, the standard approach:
        // - If the branch in X stage is taken, the next PC after the delay slot is the branch target.
        // - The delay slot instruction is the one currently in F stage.
        // - So for the next cycle, F stage should fetch from branch_target if branch_taken_x is true,
        //   otherwise fetch from pc_f + 1.
        
        // But we also need to consider: what if the branch is in W stage? No, branches are handled in X stage.
        // The W stage only writes results. The branch decision is made in X stage.
        
        // Let's use the branch_taken from the instruction in X stage to determine the next PC.
        // The instruction in X stage is at pc_x. If it's a taken branch, the delay slot is at pc_f.
        // After this cycle, the next instruction to fetch is the branch target.
        
        if (opcode_x == 2'b10 && branch_taken_x) begin
            pc_f <= branch_target;
        end else begin
            pc_f <= pc_f + 5'b1;
        end
        
        // Fetch instruction from IMEM
        // IMEM_INIT is 256 bits, byte 0 at MSB.
        // Byte at address pc_f is at bits [pc_f*8 +: 8] from the MSB side.
        // Since byte 0 is at MSB, byte N is at bits [255 - N*8 -: 8].
        // Alternatively, we can index as IMEM_INIT[255 - pc_f*8 -: 8].
        instr_f <= IMEM_INIT[255 - pc_f*8 -: 8];

        // Move F to X
        pc_x <= pc_f;
        instr_x <= instr_f;

        // Update pc_out to reflect the PC currently in F stage
        pc_out <= pc_f;
    end
end

endmodule

raw model output

```verilog
// Pipeline convention:
// - Branch delay slot: The instruction immediately following a BRcc is always fetched and executed.
//   The branch target (if taken) is fetched in the cycle after the delay slot instruction completes W stage.
// - Forwarding: Results from X stage are forwarded to X stage inputs of the next instruction.
//   Results from W stage are forwarded to X stage inputs of the instruction after that.
// - HALT: When a HALT instruction reaches W stage, halt_out is asserted and the pipeline freezes.
//   pc_out remains at the PC of the HALT instruction. r0_out remains unchanged.

module mcu3 #(
    parameter [0:255] IMEM_INIT = 256'h0  // 32 instruction bytes, byte 0 at MSB
) (
    input               clk,
    input               rst,             // synchronous, active-high
    output reg [4:0]    pc_out,          // current fetched PC (visible for testing)
    output reg          halt_out,        // asserted forever after HALT executes
    output reg signed [7:0] r0_out       // current value of register R0
);

// Register file: 4 registers, 8-bit signed
reg signed [7:0] rf [0:3];

// Pipeline registers
// Stage F (Fetch)
reg [4:0] pc_f;
reg [7:0] instr_f;

// Stage X (Execute)
reg [4:0] pc_x;
reg [7:0] instr_x;
reg signed [7:0] rs_val_x;  // Read register value (after forwarding)
reg [1:0] cond_x;           // Condition for branch
reg [3:0] off_x;            // Offset for branch
reg [1:0] rd_x;             // Destination register
reg [1:0] rs_x;             // Source register (for MOV)
reg [3:0] imm_x;            // Immediate (for ADDI)
reg [1:0] opcode_x;         // Opcode
reg branch_taken_x;         // Branch condition met?
reg is_halt_x;              // Is HALT?
reg signed [7:0] alu_res_x; // ALU result

// Stage W (Writeback)
reg [4:0] pc_w;
reg [7:0] instr_w;
reg [1:0] rd_w;             // Destination register
reg signed [7:0] result_w;  // Result to write
reg is_halt_w;              // Is HALT?
reg valid_w;                // Valid instruction in W stage

// Forwarding logic
// We need to forward from X stage and W stage to the X stage of the current instruction.
// The X stage instruction reads registers. If the source register matches a destination
// register being written by X or W stage, we forward.

// Forwarding from X stage to X stage (current instruction's X stage reads from previous instruction's X stage result)
// Forwarding from W stage to X stage (current instruction's X stage reads from instruction before previous's W stage result)

reg signed [7:0] fwd_val_x;
reg fwd_en_x;

// Determine forwarding for the instruction in X stage
always @(*) begin
    // Default: no forwarding, use register file value
    fwd_en_x = 0;
    fwd_val_x = rf[rs_x];

    // Check if we should forward from W stage (instruction in W stage writes to rd_w)
    // The instruction in W stage is the one that was in X stage 2 cycles ago.
    // It writes to rd_w. If rs_x == rd_w and rd_w != 0 and valid_w, forward from W.
    if (valid_w && (rd_w != 2'b00) && (rs_x == rd_w)) begin
        fwd_en_x = 1;
        fwd_val_x = result_w;
    end
    // Check if we should forward from X stage (instruction in X stage writes to rd_x)
    // The instruction in X stage is the one that was in F stage 1 cycle ago.
    // It computes alu_res_x. If rs_x == rd_x and rd_x != 0, forward from X.
    // Note: X stage forwarding takes priority over W stage forwarding if both match,
    // because X stage result is more recent.
    else if ((rd_x != 2'b00) && (rs_x == rd_x)) begin
        fwd_en_x = 1;
        fwd_val_x = alu_res_x;
    end
end

// Helper function to sign-extend 4-bit immediate to 8-bit
function signed [7:0] sign_ext4;
    input [3:0] val;
    begin
        sign_ext4 = {{4{val[3]}}, val};
    end
endfunction

// Main sequential logic
always @(posedge clk) begin
    if (rst) begin
        // Reset everything
        pc_f <= 5'b0;
        instr_f <= 8'b0;
        pc_x <= 5'b0;
        instr_x <= 8'b0;
        rs_val_x <= 8'b0;
        cond_x <= 2'b0;
        off_x <= 4'b0;
        rd_x <= 2'b0;
        rs_x <= 2'b0;
        imm_x <= 4'b0;
        opcode_x <= 2'b0;
        branch_taken_x <= 1'b0;
        is_halt_x <= 1'b0;
        alu_res_x <= 8'b0;
        pc_w <= 5'b0;
        instr_w <= 8'b0;
        rd_w <= 2'b0;
        result_w <= 8'b0;
        is_halt_w <= 1'b0;
        valid_w <= 1'b0;
        pc_out <= 5'b0;
        halt_out <= 1'b0;
        r0_out <= 8'b0;
        rf[0] <= 8'b0;
        rf[1] <= 8'b0;
        rf[2] <= 8'b0;
        rf[3] <= 8'b0;
    end else if (halt_out) begin
        // Pipeline frozen: keep current state
        // pc_out remains at the HALT instruction's PC
        // r0_out remains unchanged
        // All pipeline registers hold their last values
    end else begin
        // --- W Stage (Writeback) ---
        // Write result to register file if not HALT and rd != 0
        if (valid_w && !is_halt_w && (rd_w != 2'b00)) begin
            rf[rd_w] <= result_w;
        end

        // Update r0_out if R0 was written
        if (valid_w && !is_halt_w && (rd_w == 2'b00)) begin
            r0_out <= result_w;
        end else if (valid_w && is_halt_w) begin
            // HALT: r0_out unchanged
        end

        // Move X to W
        pc_w <= pc_x;
        instr_w <= instr_x;
        rd_w <= rd_x;
        result_w <= alu_res_x;
        is_halt_w <= is_halt_x;
        valid_w <= 1'b1;

        // Update halt_out if HALT reached W stage
        if (valid_w && is_halt_w) begin
            halt_out <= 1'b1;
        end

        // --- X Stage (Decode/Execute) ---
        // Decode instruction in X stage
        opcode_x <= instr_x[7:6];
        case (opcode_x)
            2'b00: begin // ADDI
                rd_x <= instr_x[5:4];
                imm_x <= instr_x[3:0];
                rs_x <= 2'b00; // Not used
                cond_x <= 2'b00; // Not used
                off_x <= 4'b0; // Not used
                is_halt_x <= 1'b0;
                branch_taken_x <= 1'b0;
            end
            2'b01: begin // MOV
                rd_x <= instr_x[5:4];
                rs_x <= instr_x[3:2];
                imm_x <= 4'b0; // Not used
                cond_x <= 2'b00; // Not used
                off_x <= 4'b0; // Not used
                is_halt_x <= 1'b0;
                branch_taken_x <= 1'b0;
            end
            2'b10: begin // BRcc
                cond_x <= instr_x[5:4];
                off_x <= instr_x[3:0];
                rd_x <= 2'b00; // Not used
                rs_x <= 2'b00; // Not used
                imm_x <= 4'b0; // Not used
                is_halt_x <= 1'b0;
            end
            2'b11: begin // HALT
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0;
                cond_x <= 2'b00;
                off_x <= 4'b0;
                is_halt_x <= 1'b1;
                branch_taken_x <= 1'b0;
            end
            default: begin
                rd_x <= 2'b00;
                rs_x <= 2'b00;
                imm_x <= 4'b0;
                cond_x <= 2'b00;
                off_x <= 4'b0;
                is_halt_x <= 1'b0;
                branch_taken_x <= 1'b0;
            end
        endcase

        // Execute ALU/Branch
        case (opcode_x)
            2'b00: begin // ADDI
                alu_res_x <= rf[rd_x] + sign_ext4(imm_x);
                branch_taken_x <= 1'b0;
            end
            2'b01: begin // MOV
                alu_res_x <= rf[rs_x];
                branch_taken_x <= 1'b0;
            end
            2'b10: begin // BRcc
                // Evaluate branch condition based on R0
                case (cond_x)
                    2'b00: branch_taken_x = 1'b1; // Always
                    2'b01: branch_taken_x = (rf[0] == 8'b0); // Zero
                    2'b10: branch_taken_x = (rf[0] < 8'b0);  // Negative
                    2'b11: branch_taken_x = (rf[0] > 8'b0);  // Positive
                    default: branch_taken_x = 1'b0;
                endcase
                alu_res_x <= 8'b0; // Not used
            end
            2'b11: begin // HALT
                alu_res_x <= 8'b0;
                branch_taken_x <= 1'b0;
            end
            default: begin
                alu_res_x <= 8'b0;
                branch_taken_x <= 1'b0;
            end
        endcase

        // --- F Stage (Fetch) ---
        // Compute next PC
        // If the instruction in X stage was a taken branch, the next PC after the delay slot
        // should be the branch target. But the delay slot instruction is the one currently in W stage.
        // So: if branch was taken in X stage, the instruction in W stage is the delay slot.
        // After W stage completes, the next fetch should be the branch target.
        // However, we need to handle the PC update carefully.
        
        // Current F stage PC is pc_f. The next instruction to fetch depends on whether
        // the branch in the X stage (which is the instruction at pc_x) was taken.
        // But wait: the X stage instruction is the one fetched in the previous cycle.
        // The W stage instruction is the one fetched 2 cycles ago.
        
        // Let's think about the pipeline timing:
        // Cycle N: F fetches PC=N, X executes PC=N-1, W writes PC=N-2
        // If X stage (PC=N-1) is a taken branch, the delay slot is PC=N (currently in F).
        // After cycle N, the delay slot (PC=N) moves to X, and the next fetch should be
        // the branch target if the branch was taken.
        
        // So: if the instruction in X stage is a taken branch, the next PC (for the cycle after the delay slot)
        // is the branch target. The delay slot instruction (currently in F) will execute normally.
        
        // Compute branch target: pc_x + sign_ext(off_x, 4) + 1
        signed [7:0] branch_target = pc_x + sign_ext4(off_x) + 1;
        
        // Determine next PC for F stage
        // If the instruction in W stage was a taken branch, then the current F stage instruction
        // is the delay slot, and the next F stage should fetch the branch target.
        // Wait, let's reconsider. The branch instruction is in X stage. The delay slot is in F stage.
        // After this cycle, the delay slot moves to X, and we need to fetch the branch target (if taken)
        // or pc_f + 1 (if not taken) for the next F stage.
        
        // Actually, the standard approach:
        // - If the branch in X stage is taken, the next PC after the delay slot is the branch target.
        // - The delay slot instruction is the one currently in F stage.
        // - So for the next cycle, F stage should fetch from branch_target if branch_taken_x is true,
        //   otherwise fetch from pc_f + 1.
        
        // But we also need to consider: what if the branch is in W stage? No, branches are handled in X stage.
        // The W stage only writes results. The branch decision is made in X stage.
        
        // Let's use the branch_taken from the instruction in X stage to determine the next PC.
        // The instruction in X stage is at pc_x. If it's a taken branch, the delay slot is at pc_f.
        // After this cycle, the next instruction to fetch is the branch target.
        
        if (opcode_x == 2'b10 && branch_taken_x) begin
            pc_f <= branch_target;
        end else begin
            pc_f <= pc_f + 5'b1;
        end
        
        // Fetch instruction from IMEM
        // IMEM_INIT is 256 bits, byte 0 at MSB.
        // Byte at address pc_f is at bits [pc_f*8 +: 8] from the MSB side.
        // Since byte 0 is at MSB, byte N is at bits [255 - N*8 -: 8].
        // Alternatively, we can index as IMEM_INIT[255 - pc_f*8 -: 8].
        instr_f <= IMEM_INIT[255 - pc_f*8 -: 8];

        // Move F to X
        pc_x <= pc_f;
        instr_x <= instr_f;

        // Update pc_out to reflect the PC currently in F stage
        pc_out <= pc_f;
    end
end

endmodule
```

line-item PPA

per-program functional

pipeline shape (heuristic register bank count)

mistakes detected

extracted RTL

raw model output